drbh HF Staff commited on
Commit
3b25788
·
verified ·
1 Parent(s): dac61af

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. activation/impls/artifacts/benchmark/activation.jsonl +9 -9
  2. activation/impls/cells/benchmark.py +2 -2
  3. activation/impls/cells/sysinfo.py +14 -0
  4. activation/impls/hf_kernels_swiglu.html +96 -95
  5. activation/impls/index.html +1 -2
  6. activation/impls/torch_swiglu.html +120 -120
  7. activation/impls/torch_swiglu_darwin.html +0 -0
  8. activation/index.html +1 -1
  9. activation/results_darwin/artifacts/combine/latency.svg +3 -0
  10. activation/results_darwin/cells/combine.py +25 -0
  11. activation/results_darwin/combined_results.html +0 -0
  12. activation/results_darwin/index.html +88 -0
  13. activation/results_linux/artifacts/combine/latency.svg +2 -2
  14. activation/results_linux/combined_results.html +85 -111
  15. causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl +24 -24
  16. causal_conv1d/impls/hf_kernels_causal_conv1d.html +0 -0
  17. causal_conv1d/impls/torch_causal_conv1d.html +0 -0
  18. causal_conv1d/results/artifacts/combine/latency.svg +2 -2
  19. causal_conv1d/results/combined_results.html +131 -131
  20. deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl +4 -4
  21. deformable_detr/impls/cells/benchmark.py +18 -94
  22. deformable_detr/impls/hf_kernels_deformable_detr.html +78 -78
  23. deformable_detr/impls/torch_deformable_detr.html +103 -97
  24. deformable_detr/results/artifacts/combine/latency.svg +2 -2
  25. deformable_detr/results/combined_results.html +56 -56
  26. flash_attn/impls/artifacts/benchmark/attention.jsonl +6 -6
  27. flash_attn/impls/cells/benchmark.py +8 -9
  28. flash_attn/impls/flash_attention.html +140 -140
  29. flash_attn/impls/hf_kernels_flash_attn.html +93 -93
  30. flash_attn/impls/hf_kernels_flash_attn3.html +82 -83
  31. flash_attn/impls/mem_efficient_attention.html +134 -140
  32. flash_attn/impls/sage_attention.html +10 -12
  33. flash_attn/impls/xformers.html +90 -90
  34. flash_attn/results/artifacts/combine/latency.svg +2 -2
  35. flash_attn/results/combined_results.html +143 -143
  36. index.html +1 -1
  37. layer_norm/impls/artifacts/benchmark/layer_norm.jsonl +4 -4
  38. layer_norm/impls/hf_kernels_layer_norm.html +54 -57
  39. layer_norm/impls/torch_layer_norm.html +54 -100
  40. layer_norm/results/artifacts/combine/latency.svg +2 -2
  41. layer_norm/results/combined_results.html +52 -52
  42. openai_moe/impls/artifacts/benchmark/openai_moe.jsonl +8 -8
  43. openai_moe/impls/binned_torch.html +189 -189
  44. openai_moe/impls/gpt_oss_moe.html +191 -191
  45. openai_moe/results/artifacts/combine/latency.svg +2 -2
  46. openai_moe/results/combined_results.html +240 -188
  47. rotary/impls/artifacts/benchmark/rotary.jsonl +24 -24
  48. rotary/impls/hf_kernels_rotary.html +0 -0
  49. rotary/impls/torch_rotary.html +0 -0
  50. rotary/index.html +1 -1
activation/impls/artifacts/benchmark/activation.jsonl CHANGED
@@ -1,9 +1,9 @@
1
- {"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04233100003148138, "p50": 0.043751000021075015, "p90": 0.044161999994685175, "mean": 0.04361539999990782, "iqr": 0.001740000016070553, "raw_times": [0.044161999994685175, 0.04541099997368292, 0.04242199997861462, 0.043751000021075015, 0.04233100003148138], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05063100002189458, "peak_bytes": 1966080, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
2
- {"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.054010999974707374, "p50": 0.05540200004361395, "p90": 0.05709199990633351, "mean": 0.057631800018498325, "iqr": 0.0019599997358454857, "raw_times": [0.054010999974707374, 0.05513200017048803, 0.06652199999734876, 0.05540200004361395, 0.05709199990633351], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05926099993303069, "peak_bytes": 2621440, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
3
- {"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05346099987946218, "p50": 0.054341999884854886, "p90": 0.05543199995372561, "mean": 0.054953799917711876, "iqr": 0.001390000079481979, "raw_times": [0.05346099987946218, 0.05543199995372561, 0.05749199999627308, 0.054341999884854886, 0.05404199987424363], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05924099991716503, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
4
- {"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.053281999953469494, "p50": 0.054581999847869156, "p90": 0.05551200001718826, "mean": 0.054651799973726156, "iqr": 0.0014510001165035646, "raw_times": [0.05406099990068469, 0.05582200014941918, 0.05551200001718826, 0.054581999847869156, 0.053281999953469494], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05814099995404831, "peak_bytes": 3932160, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
5
- {"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05309099992700794, "p50": 0.05449100012810959, "p90": 0.05478200000652578, "mean": 0.05435540001599293, "iqr": 0.0010310000106983352, "raw_times": [0.05449100012810959, 0.055662000022493885, 0.05375099999582744, 0.05309099992700794, 0.05478200000652578], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057451999964541756, "peak_bytes": 5242880, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
6
- {"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.051550999842220335, "p50": 0.052460999995673774, "p90": 0.05307099991114228, "mean": 0.05247719996077649, "iqr": 0.000889999910214101, "raw_times": [0.051550999842220335, 0.05307099991114228, 0.05218100000092818, 0.052460999995673774, 0.05312200005391787], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.07207299995570793, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
7
- {"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052890999995725, "p50": 0.05325200004335784, "p90": 0.054772000112279784, "mean": 0.053839400061406195, "iqr": 0.001821000068957801, "raw_times": [0.05295100004332198, 0.054772000112279784, 0.05325200004335784, 0.052890999995725, 0.055331000112346373], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05688200008080457, "peak_bytes": 7864320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
8
- {"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05233100000623381, "p50": 0.054522000027645845, "p90": 0.05475100010698952, "mean": 0.05385140002545086, "iqr": 0.0020300001324358163, "raw_times": [0.052720999974553706, 0.05475100010698952, 0.05233100000623381, 0.054932000011831406, 0.054522000027645845], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056971000049088616, "peak_bytes": 10485760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
9
- {"ts": "2025-12-19T19:54:24Z", "run": "9f2d48bd832042b09eca95e62da3c0de", "impl": "torch_eager", "tags": {"family": "hf-kernels", "backend": "eager"}, "wl": {"name": "cuda_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.052550999953382416, "p50": 0.05365099991649913, "p90": 0.053941000032864395, "mean": 0.053534999960902496, "iqr": 0.0006200000370881753, "raw_times": [0.05421099990599032, 0.05365099991649913, 0.052550999953382416, 0.053941000032864395, 0.05332099999577622], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.058042000091518275, "peak_bytes": 20971520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "swiglu_fp32"}, "err": null}
 
1
+ {"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T128_D768", "num_tokens": 128, "hidden_dim": 768, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.13450000000148066, "p50": 0.1411669999811238, "p90": 0.1532919999931437, "mean": 0.1477000000022599, "iqr": 0.017083999978240172, "raw_times": [0.13620800001490352, 0.1733330000206479, 0.1532919999931437, 0.1411669999811238, 0.13450000000148066], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.1447500000040236, "peak_bytes": null, "ok": false, "absmax": 0.04913330078125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.04913330078125, "mae": 0.0008915023063309491, "mse": 4.496400833886582e-06, "ref": "swiglu_fp32"}, "err": null}
2
+ {"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T128_D1024", "num_tokens": 128, "hidden_dim": 1024, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.1742909999506992, "p50": 0.17550000001165245, "p90": 0.17633400000249821, "mean": 0.17563320000135718, "iqr": 0.001000999986899842, "raw_times": [0.1742909999506992, 0.17633400000249821, 0.17533300001559837, 0.17670800002633769, 0.17550000001165245], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.731916999995974, "peak_bytes": null, "ok": false, "absmax": 0.06802082061767578, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.06802082061767578, "mae": 0.0008884685230441391, "mse": 4.475335117604118e-06, "ref": "swiglu_fp32"}, "err": null}
3
+ {"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T128_D2048", "num_tokens": 128, "hidden_dim": 2048, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.35966699999789853, "p50": 0.3839590000325188, "p90": 0.4197920000024169, "mean": 0.3930668000066362, "iqr": 0.05745900000420079, "raw_times": [0.35966699999789853, 0.3623329999982161, 0.4395830000021306, 0.4197920000024169, 0.3839590000325188], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.37070900003755014, "peak_bytes": null, "ok": false, "absmax": 0.07091712951660156, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.07091712951660156, "mae": 0.0008893357589840889, "mse": 4.469751274882583e-06, "ref": "swiglu_fp32"}, "err": null}
4
+ {"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T256_D768", "num_tokens": 256, "hidden_dim": 768, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.27337500000612636, "p50": 0.325791999955527, "p90": 0.3564579999988382, "mean": 0.5360415999916768, "iqr": 0.03887500002974775, "raw_times": [1.4070000000288019, 0.3564579999988382, 0.325791999955527, 0.27337500000612636, 0.31758299996909045], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 1.2649170000145205, "peak_bytes": null, "ok": false, "absmax": 0.04913330078125, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.04913330078125, "mae": 0.0008873133920133114, "mse": 4.3958548303635325e-06, "ref": "swiglu_fp32"}, "err": null}
5
+ {"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T256_D1024", "num_tokens": 256, "hidden_dim": 1024, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.3514999999651991, "p50": 0.39737500003411697, "p90": 0.42058299999325754, "mean": 0.44304979999196803, "iqr": 0.05525000000261571, "raw_times": [0.42058299999325754, 0.39737500003411697, 0.6804579999766247, 0.3514999999651991, 0.36533299999064184], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 1.976333000015984, "peak_bytes": null, "ok": false, "absmax": 0.06802082061767578, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.06802082061767578, "mae": 0.0008889895398169756, "mse": 4.431089109857567e-06, "ref": "swiglu_fp32"}, "err": null}
6
+ {"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T256_D2048", "num_tokens": 256, "hidden_dim": 2048, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.9706249999794636, "p50": 0.9802499999977954, "p90": 3.842000000020107, "mean": 2.413258199999291, "iqr": 2.863209000054212, "raw_times": [3.842000000020107, 5.294625000033193, 0.9802499999977954, 0.978790999965895, 0.9706249999794636], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 2.3860840000224925, "peak_bytes": null, "ok": false, "absmax": 0.08395957946777344, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.08395957946777344, "mae": 0.0008889408782124519, "mse": 4.476671620068373e-06, "ref": "swiglu_fp32"}, "err": null}
7
+ {"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T512_D768", "num_tokens": 512, "hidden_dim": 768, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 0.6639999999720203, "p50": 0.8687079999845082, "p90": 1.1298749999468782, "mean": 0.9603583999933107, "iqr": 0.2749159999098083, "raw_times": [0.8549590000370699, 1.284250000026077, 1.1298749999468782, 0.6639999999720203, 0.8687079999845082], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.7134589999964192, "peak_bytes": null, "ok": false, "absmax": 0.05687236785888672, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.05687236785888672, "mae": 0.0008884922135621309, "mse": 4.399109002406476e-06, "ref": "swiglu_fp32"}, "err": null}
8
+ {"ts": "2025-12-19T22:43:48Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T512_D1024", "num_tokens": 512, "hidden_dim": 1024, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 1.141958000005161, "p50": 1.6311670000277445, "p90": 1.6544580000186215, "mean": 1.7749248000086482, "iqr": 0.366167000038331, "raw_times": [1.6544580000186215, 1.2882909999802905, 3.1587500000114233, 1.6311670000277445, 1.141958000005161], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 2.0730410000169286, "peak_bytes": null, "ok": false, "absmax": 0.06802082061767578, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.06802082061767578, "mae": 0.0008890957687981427, "mse": 4.448749677976593e-06, "ref": "swiglu_fp32"}, "err": null}
9
+ {"ts": "2025-12-19T22:43:49Z", "run": "4f266502732344bb9d0a3cd9527f873a", "impl": "torch_eager_darwin", "tags": {"family": "pytorch", "backend": "eager", "platform": "darwin"}, "wl": {"name": "cpu_T512_D2048", "num_tokens": 512, "hidden_dim": 2048, "dtype": "float32", "device": "cpu", "seed": 0}, "env": {"torch": "2.8.0", "cuda": "", "gpu": "", "sm": "", "py": "3.11.14", "plat": "macOS-15.7.2-arm64-arm-64bit"}, "lat_ms": {"p10": 2.664708999986942, "p50": 3.365374999987125, "p90": 3.6645420000240847, "mean": 3.5541085999966526, "iqr": 0.8831670000404301, "raw_times": [2.664708999986942, 3.6645420000240847, 3.365374999987125, 5.2945420000014565, 2.7813749999836546], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 17.938291999996636, "peak_bytes": null, "ok": false, "absmax": 0.09098148345947266, "corr": {"ok": false, "rtol": 0.001, "atol": 0.001, "absmax": 0.09098148345947266, "mae": 0.0008892239420674741, "mse": 4.500504473980982e-06, "ref": "swiglu_fp32"}, "err": null}
activation/impls/cells/benchmark.py CHANGED
@@ -22,7 +22,7 @@ def swiglu_eager(x):
22
 
23
  run_benchmark(
24
  kernel_type=KernelTypeEnum.ACTIVATION,
25
- impl_name="torch_eager",
26
- impl_tags={"family":"hf-kernels", "backend":"eager"},
27
  impl_func=swiglu_eager,
28
  )
 
22
 
23
  run_benchmark(
24
  kernel_type=KernelTypeEnum.ACTIVATION,
25
+ impl_name="torch_eager_darwin",
26
+ impl_tags={"family":"pytorch", "backend":"eager", "platform": "darwin"},
27
  impl_func=swiglu_eager,
28
  )
activation/impls/cells/sysinfo.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.10"
3
+ # dependencies = [
4
+ # "torch==2.8.0",
5
+ # ]
6
+ # ///
7
+ import platform
8
+ import subprocess
9
+ print(f"Platform: {platform.system()} {platform.machine()}")
10
+ print(f"Python: {platform.python_version()}")
11
+ # Check for MPS availability
12
+ import torch
13
+ print(f"PyTorch: {torch.__version__}")
14
+ print(f"MPS available: {torch.backends.mps.is_available()}")
activation/impls/hf_kernels_swiglu.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.29s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3905,7 +3905,7 @@ Cell: nv | 0.29s
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
- <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 19:54:13 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
@@ -3914,7 +3914,7 @@ Cell: nv | 0.29s
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
- | N/A 35C P0 120W / 350W | 0MiB / 46068MiB | 100% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.29s
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3940
  </span> |
3941
- Cell: benchmark | 8.35s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3995,16 +3995,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D768
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3997
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3998
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 85.600us 2073.64% 85.600us 85.600us 1
3999
- hf_kernels_swiglu 8.76% 183.666us 99.29% 2.081ms 2.081ms 0.000us 0.00% 5.568us 5.568us 1
4000
- _activation_23bf3fb::silu_and_mul 0.98% 20.570us 88.50% 1.855ms 618.341us 4.128us 100.00% 5.568us 1.856us 3
4001
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.128us 100.00% 4.128us 1.376us 3
4002
- Activity Buffer Request 85.39% 1.790ms 85.39% 1.790ms 1.790ms 1.440us 34.88% 1.440us 1.440us 1
4003
- aten::empty 2.03% 42.471us 2.03% 42.471us 14.157us 0.000us 0.00% 0.000us 0.000us 3
4004
- cudaLaunchKernel 2.13% 44.611us 2.13% 44.611us 14.870us 0.000us 0.00% 0.000us 0.000us 3
4005
- cudaDeviceSynchronize 0.71% 14.820us 0.71% 14.820us 14.820us 0.000us 0.00% 0.000us 0.000us 1
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
- Self CPU time total: 2.096ms
4008
  Self CUDA time total: 4.128us
4009
 
4010
 
@@ -4015,17 +4015,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D1024
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 66.111us 1666.52% 66.111us 66.111us 1
4019
- hf_kernels_swiglu 4.94% 94.004us 99.69% 1.897ms 1.897ms 0.000us 0.00% 5.311us 5.311us 1
4020
- _activation_23bf3fb::silu_and_mul 0.99% 18.841us 93.73% 1.783ms 594.417us 3.967us 100.00% 5.311us 1.770us 3
4021
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.967us 100.00% 3.967us 1.322us 3
4022
- Activity Buffer Request 91.36% 1.738ms 91.36% 1.738ms 1.738ms 1.344us 33.88% 1.344us 1.344us 1
4023
- aten::empty 1.01% 19.260us 1.01% 19.260us 6.420us 0.000us 0.00% 0.000us 0.000us 3
4024
- cudaLaunchKernel 1.38% 26.230us 1.38% 26.230us 8.743us 0.000us 0.00% 0.000us 0.000us 3
4025
- cudaDeviceSynchronize 0.31% 5.950us 0.31% 5.950us 5.950us 0.000us 0.00% 0.000us 0.000us 1
4026
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4027
- Self CPU time total: 1.902ms
4028
- Self CUDA time total: 3.967us
4029
 
4030
 
4031
 
@@ -4035,17 +4035,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T128_D2048
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 68.479us 1380.35% 68.479us 68.479us 1
4039
- hf_kernels_swiglu 4.69% 88.684us 99.71% 1.886ms 1.886ms 0.000us 0.00% 6.625us 6.625us 1
4040
- _activation_23bf3fb::silu_and_mul 0.99% 18.661us 94.04% 1.778ms 592.827us 4.961us 100.00% 6.625us 2.208us 3
4041
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.961us 100.00% 4.961us 1.654us 3
4042
- Activity Buffer Request 91.53% 1.731ms 91.53% 1.731ms 1.731ms 1.664us 33.54% 1.664us 1.664us 1
4043
- aten::empty 0.98% 18.610us 0.98% 18.610us 6.203us 0.000us 0.00% 0.000us 0.000us 3
4044
- cudaLaunchKernel 1.52% 28.800us 1.52% 28.800us 9.600us 0.000us 0.00% 0.000us 0.000us 3
4045
- cudaDeviceSynchronize 0.29% 5.500us 0.29% 5.500us 5.500us 0.000us 0.00% 0.000us 0.000us 1
4046
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4047
- Self CPU time total: 1.891ms
4048
- Self CUDA time total: 4.961us
4049
 
4050
 
4051
 
@@ -4055,17 +4055,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D768
4055
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4056
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4057
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4058
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 66.368us 1547.76% 66.368us 66.368us 1
4059
- hf_kernels_swiglu 4.25% 87.402us 99.76% 2.051ms 2.051ms 0.000us 0.00% 5.760us 5.760us 1
4060
- _activation_23bf3fb::silu_and_mul 0.97% 19.981us 94.58% 1.945ms 648.228us 4.288us 100.00% 5.760us 1.920us 3
4061
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.288us 100.00% 4.288us 1.429us 3
4062
- Activity Buffer Request 83.83% 1.724ms 83.83% 1.724ms 1.724ms 1.472us 34.33% 1.472us 1.472us 1
4063
- aten::empty 0.93% 19.111us 0.93% 19.111us 6.370us 0.000us 0.00% 0.000us 0.000us 3
4064
- cudaLaunchKernel 9.77% 200.885us 9.77% 200.885us 66.962us 0.000us 0.00% 0.000us 0.000us 3
4065
- cudaDeviceSynchronize 0.24% 5.020us 0.24% 5.020us 5.020us 0.000us 0.00% 0.000us 0.000us 1
4066
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4067
- Self CPU time total: 2.056ms
4068
- Self CUDA time total: 4.288us
4069
 
4070
 
4071
 
@@ -4075,17 +4075,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D1024
4075
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4077
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4078
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 67.360us 1131.72% 67.360us 67.360us 1
4079
- hf_kernels_swiglu 4.31% 89.293us 99.77% 2.067ms 2.067ms 0.000us 0.00% 7.968us 7.968us 1
4080
- _activation_23bf3fb::silu_and_mul 0.98% 20.220us 94.55% 1.959ms 652.859us 5.952us 100.00% 7.968us 2.656us 3
4081
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.952us 100.00% 5.952us 1.984us 3
4082
- Activity Buffer Request 85.78% 1.777ms 85.78% 1.777ms 1.777ms 2.016us 33.87% 2.016us 2.016us 1
4083
- aten::empty 0.91% 18.861us 0.91% 18.861us 6.287us 0.000us 0.00% 0.000us 0.000us 3
4084
- cudaLaunchKernel 7.79% 161.464us 7.79% 161.464us 53.821us 0.000us 0.00% 0.000us 0.000us 3
4085
- cudaDeviceSynchronize 0.23% 4.820us 0.23% 4.820us 4.820us 0.000us 0.00% 0.000us 0.000us 1
4086
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4087
- Self CPU time total: 2.072ms
4088
- Self CUDA time total: 5.952us
4089
 
4090
 
4091
 
@@ -4095,17 +4095,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T256_D2048
4095
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4096
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4097
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4098
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 64.574us 830.43% 64.574us 64.574us 1
4099
- hf_kernels_swiglu 18.42% 86.111us 98.86% 462.073us 462.073us 0.000us 0.00% 10.367us 10.367us 1
4100
- _activation_23bf3fb::silu_and_mul 4.27% 19.980us 76.48% 357.451us 119.150us 7.776us 100.00% 10.367us 3.456us 3
4101
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.776us 100.00% 7.776us 2.592us 3
4102
- Activity Buffer Request 38.90% 181.805us 38.90% 181.805us 181.805us 2.591us 33.32% 2.591us 2.591us 1
4103
- aten::empty 3.96% 18.511us 3.96% 18.511us 6.170us 0.000us 0.00% 0.000us 0.000us 3
4104
- cudaLaunchKernel 33.30% 155.666us 33.30% 155.666us 51.889us 0.000us 0.00% 0.000us 0.000us 3
4105
- cudaDeviceSynchronize 1.14% 5.330us 1.14% 5.330us 5.330us 0.000us 0.00% 0.000us 0.000us 1
4106
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4107
- Self CPU time total: 467.403us
4108
- Self CUDA time total: 7.776us
4109
 
4110
 
4111
 
@@ -4115,16 +4115,16 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D768
4115
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4116
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4117
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4118
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.527us 943.95% 62.527us 62.527us 1
4119
- hf_kernels_swiglu 18.86% 83.092us 98.85% 435.523us 435.523us 0.000us 0.00% 8.832us 8.832us 1
4120
- _activation_23bf3fb::silu_and_mul 4.63% 20.380us 75.83% 334.080us 111.360us 6.624us 100.00% 8.832us 2.944us 3
4121
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.624us 100.00% 6.624us 2.208us 3
4122
- Activity Buffer Request 36.44% 160.555us 36.44% 160.555us 160.555us 2.208us 33.33% 2.208us 2.208us 1
4123
- aten::empty 4.17% 18.351us 4.17% 18.351us 6.117us 0.000us 0.00% 0.000us 0.000us 3
4124
- cudaLaunchKernel 34.76% 153.145us 34.76% 153.145us 51.048us 0.000us 0.00% 0.000us 0.000us 3
4125
- cudaDeviceSynchronize 1.15% 5.060us 1.15% 5.060us 5.060us 0.000us 0.00% 0.000us 0.000us 1
4126
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4127
- Self CPU time total: 440.583us
4128
  Self CUDA time total: 6.624us
4129
 
4130
 
@@ -4135,17 +4135,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D1024
4135
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4136
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4137
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4138
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 69.184us 732.88% 69.184us 69.184us 1
4139
- hf_kernels_swiglu 4.54% 90.562us 99.76% 1.988ms 1.988ms 0.000us 0.00% 12.608us 12.608us 1
4140
- _activation_23bf3fb::silu_and_mul 1.02% 20.260us 94.19% 1.877ms 625.705us 9.440us 100.00% 12.608us 4.203us 3
4141
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.440us 100.00% 9.440us 3.147us 3
4142
- Activity Buffer Request 85.41% 1.702ms 85.41% 1.702ms 1.702ms 3.168us 33.56% 3.168us 3.168us 1
4143
- aten::empty 1.03% 20.450us 1.03% 20.450us 6.817us 0.000us 0.00% 0.000us 0.000us 3
4144
- cudaLaunchKernel 7.76% 154.666us 7.76% 154.666us 51.555us 0.000us 0.00% 0.000us 0.000us 3
4145
- cudaDeviceSynchronize 0.24% 4.870us 0.24% 4.870us 4.870us 0.000us 0.00% 0.000us 0.000us 1
4146
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4147
- Self CPU time total: 1.993ms
4148
- Self CUDA time total: 9.440us
4149
 
4150
 
4151
 
@@ -4155,17 +4155,17 @@ PROFILE TRACE: hf_kernels_swiglu | cuda_T512_D2048
4155
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4156
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4157
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4158
- hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 65.376us 499.51% 65.376us 65.376us 1
4159
- hf_kernels_swiglu 19.52% 83.334us 98.75% 421.512us 421.512us 0.000us 0.00% 17.472us 17.472us 1
4160
- _activation_23bf3fb::silu_and_mul 4.53% 19.340us 74.78% 319.198us 106.399us 13.088us 100.00% 17.472us 5.824us 3
4161
- void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.088us 100.00% 13.088us 4.363us 3
4162
- Activity Buffer Request 34.31% 146.444us 34.31% 146.444us 146.444us 4.384us 33.50% 4.384us 4.384us 1
4163
- aten::empty 4.45% 18.980us 4.45% 18.980us 6.327us 0.000us 0.00% 0.000us 0.000us 3
4164
- cudaLaunchKernel 35.94% 153.414us 35.94% 153.414us 51.138us 0.000us 0.00% 0.000us 0.000us 3
4165
- cudaDeviceSynchronize 1.25% 5.350us 1.25% 5.350us 5.350us 0.000us 0.00% 0.000us 0.000us 1
4166
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4167
- Self CPU time total: 426.862us
4168
- Self CUDA time total: 13.088us
4169
 
4170
 
4171
  impl wl p50(ms) ok
@@ -4182,13 +4182,14 @@ hf_kernels_swiglu cuda_T512_D768 0.03 True
4182
  <div class="uv-install-logs" id="uv-logs-benchmark">
4183
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4184
  <div class="uv-logs-content" style="display: none;">
4185
- Installed 51 packages in 320ms
4186
  </div>
4187
  </div>
4188
- <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4189
- Fetching 7 files: 14%|█▍ | 1/7 [00:00&lt;00:01, 5.80it/s]
4190
- Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 13.68it/s]
4191
- Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 17.69it/s]</div>
 
4192
  <div class="cell-artifacts">
4193
  <h4>Artifacts:</h4>
4194
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.25s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 23:01:11 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
 
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
+ | N/A 39C P0 82W / 350W | 0MiB / 46068MiB | 10% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
 
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3940
  </span> |
3941
+ Cell: benchmark | 8.49s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3995
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3996
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3997
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3998
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 76.129us 1844.21% 76.129us 76.129us 1
3999
+ hf_kernels_swiglu 8.60% 174.603us 99.27% 2.015ms 2.015ms 0.000us 0.00% 5.568us 5.568us 1
4000
+ _activation_23bf3fb::silu_and_mul 0.97% 19.670us 88.54% 1.797ms 599.020us 4.128us 100.00% 5.568us 1.856us 3
4001
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.128us 100.00% 4.128us 1.376us 3
4002
+ Activity Buffer Request 85.37% 1.733ms 85.37% 1.733ms 1.733ms 1.440us 34.88% 1.440us 1.440us 1
4003
+ aten::empty 2.13% 43.191us 2.13% 43.191us 14.397us 0.000us 0.00% 0.000us 0.000us 3
4004
+ cudaLaunchKernel 2.20% 44.752us 2.20% 44.752us 14.917us 0.000us 0.00% 0.000us 0.000us 3
4005
+ cudaDeviceSynchronize 0.73% 14.741us 0.73% 14.741us 14.741us 0.000us 0.00% 0.000us 0.000us 1
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
+ Self CPU time total: 2.030ms
4008
  Self CUDA time total: 4.128us
4009
 
4010
 
 
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.783us 1582.23% 62.783us 62.783us 1
4019
+ hf_kernels_swiglu 4.95% 92.601us 99.70% 1.863ms 1.863ms 0.000us 0.00% 5.312us 5.312us 1
4020
+ _activation_23bf3fb::silu_and_mul 1.25% 23.392us 93.77% 1.753ms 584.220us 3.968us 100.00% 5.312us 1.771us 3
4021
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 3.968us 100.00% 3.968us 1.323us 3
4022
+ Activity Buffer Request 91.17% 1.704ms 91.17% 1.704ms 1.704ms 1.344us 33.87% 1.344us 1.344us 1
4023
+ aten::empty 0.97% 18.160us 0.97% 18.160us 6.053us 0.000us 0.00% 0.000us 0.000us 3
4024
+ cudaLaunchKernel 1.35% 25.221us 1.35% 25.221us 8.407us 0.000us 0.00% 0.000us 0.000us 3
4025
+ cudaDeviceSynchronize 0.30% 5.620us 0.30% 5.620us 5.620us 0.000us 0.00% 0.000us 0.000us 1
4026
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4027
+ Self CPU time total: 1.869ms
4028
+ Self CUDA time total: 3.968us
4029
 
4030
 
4031
 
 
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4037
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4038
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 61.887us 1264.03% 61.887us 61.887us 1
4039
+ hf_kernels_swiglu 4.90% 91.392us 99.70% 1.861ms 1.861ms 0.000us 0.00% 6.528us 6.528us 1
4040
+ _activation_23bf3fb::silu_and_mul 1.06% 19.772us 93.81% 1.751ms 583.690us 4.896us 100.00% 6.528us 2.176us 3
4041
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.896us 100.00% 4.896us 1.632us 3
4042
+ Activity Buffer Request 91.42% 1.706ms 91.42% 1.706ms 1.706ms 1.632us 33.33% 1.632us 1.632us 1
4043
+ aten::empty 1.00% 18.580us 1.00% 18.580us 6.193us 0.000us 0.00% 0.000us 0.000us 3
4044
+ cudaLaunchKernel 1.33% 24.870us 1.33% 24.870us 8.290us 0.000us 0.00% 0.000us 0.000us 3
4045
+ cudaDeviceSynchronize 0.30% 5.640us 0.30% 5.640us 5.640us 0.000us 0.00% 0.000us 0.000us 1
4046
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4047
+ Self CPU time total: 1.867ms
4048
+ Self CUDA time total: 4.896us
4049
 
4050
 
4051
 
 
4055
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4056
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4057
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4058
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 66.431us 1560.88% 66.431us 66.431us 1
4059
+ hf_kernels_swiglu 4.62% 96.552us 99.72% 2.084ms 2.084ms 0.000us 0.00% 5.696us 5.696us 1
4060
+ _activation_23bf3fb::silu_and_mul 0.92% 19.230us 94.20% 1.969ms 656.267us 4.256us 100.00% 5.696us 1.899us 3
4061
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 4.256us 100.00% 4.256us 1.419us 3
4062
+ Activity Buffer Request 82.63% 1.727ms 82.63% 1.727ms 1.727ms 1.440us 33.83% 1.440us 1.440us 1
4063
+ aten::empty 0.91% 18.961us 0.91% 18.961us 6.320us 0.000us 0.00% 0.000us 0.000us 3
4064
+ cudaLaunchKernel 10.64% 222.454us 10.64% 222.454us 74.151us 0.000us 0.00% 0.000us 0.000us 3
4065
+ cudaDeviceSynchronize 0.28% 5.800us 0.28% 5.800us 5.800us 0.000us 0.00% 0.000us 0.000us 1
4066
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4067
+ Self CPU time total: 2.090ms
4068
+ Self CUDA time total: 4.256us
4069
 
4070
 
4071
 
 
4075
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4077
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4078
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 62.753us 1065.60% 62.753us 62.753us 1
4079
+ hf_kernels_swiglu 4.32% 90.233us 99.73% 2.084ms 2.084ms 0.000us 0.00% 7.842us 7.842us 1
4080
+ _activation_23bf3fb::silu_and_mul 0.98% 20.530us 94.51% 1.975ms 658.421us 5.889us 100.00% 7.842us 2.614us 3
4081
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 5.889us 100.00% 5.889us 1.963us 3
4082
+ Activity Buffer Request 83.43% 1.744ms 83.43% 1.744ms 1.744ms 1.953us 33.16% 1.953us 1.953us 1
4083
+ aten::empty 0.90% 18.820us 0.90% 18.820us 6.273us 0.000us 0.00% 0.000us 0.000us 3
4084
+ cudaLaunchKernel 10.09% 210.974us 10.09% 210.974us 70.325us 0.000us 0.00% 0.000us 0.000us 3
4085
+ cudaDeviceSynchronize 0.27% 5.680us 0.27% 5.680us 5.680us 0.000us 0.00% 0.000us 0.000us 1
4086
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4087
+ Self CPU time total: 2.090ms
4088
+ Self CUDA time total: 5.889us
4089
 
4090
 
4091
 
 
4095
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4096
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4097
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4098
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 58.974us 761.74% 58.974us 58.974us 1
4099
+ hf_kernels_swiglu 14.39% 83.563us 99.11% 575.543us 575.543us 0.000us 0.00% 10.333us 10.333us 1
4100
+ _activation_23bf3fb::silu_and_mul 3.37% 19.590us 81.67% 474.270us 158.090us 7.742us 100.00% 10.333us 3.444us 3
4101
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 7.742us 100.00% 7.742us 2.581us 3
4102
+ Activity Buffer Request 43.30% 251.476us 43.30% 251.476us 251.476us 2.591us 33.47% 2.591us 2.591us 1
4103
+ aten::empty 3.05% 17.710us 3.05% 17.710us 5.903us 0.000us 0.00% 0.000us 0.000us 3
4104
+ cudaLaunchKernel 34.99% 203.204us 34.99% 203.204us 67.735us 0.000us 0.00% 0.000us 0.000us 3
4105
+ cudaDeviceSynchronize 0.89% 5.190us 0.89% 5.190us 5.190us 0.000us 0.00% 0.000us 0.000us 1
4106
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4107
+ Self CPU time total: 580.733us
4108
+ Self CUDA time total: 7.742us
4109
 
4110
 
4111
 
 
4115
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4116
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4117
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4118
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 60.191us 908.68% 60.191us 60.191us 1
4119
+ hf_kernels_swiglu 14.49% 83.902us 99.19% 574.293us 574.293us 0.000us 0.00% 8.832us 8.832us 1
4120
+ _activation_23bf3fb::silu_and_mul 3.38% 19.561us 81.54% 472.101us 157.367us 6.624us 100.00% 8.832us 2.944us 3
4121
  void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 6.624us 100.00% 6.624us 2.208us 3
4122
+ Activity Buffer Request 43.39% 251.205us 43.39% 251.205us 251.205us 2.208us 33.33% 2.208us 2.208us 1
4123
+ aten::empty 3.16% 18.290us 3.16% 18.290us 6.097us 0.000us 0.00% 0.000us 0.000us 3
4124
+ cudaLaunchKernel 34.77% 201.335us 34.77% 201.335us 67.112us 0.000us 0.00% 0.000us 0.000us 3
4125
+ cudaDeviceSynchronize 0.81% 4.680us 0.81% 4.680us 4.680us 0.000us 0.00% 0.000us 0.000us 1
4126
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4127
+ Self CPU time total: 578.973us
4128
  Self CUDA time total: 6.624us
4129
 
4130
 
 
4135
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4136
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4137
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4138
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 64.480us 685.45% 64.480us 64.480us 1
4139
+ hf_kernels_swiglu 4.47% 90.662us 99.76% 2.023ms 2.023ms 0.000us 0.00% 12.543us 12.543us 1
4140
+ _activation_23bf3fb::silu_and_mul 0.98% 19.960us 94.38% 1.913ms 637.817us 9.407us 100.00% 12.543us 4.181us 3
4141
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 9.407us 100.00% 9.407us 3.136us 3
4142
+ Activity Buffer Request 83.63% 1.695ms 83.63% 1.695ms 1.695ms 3.136us 33.34% 3.136us 3.136us 1
4143
+ aten::empty 0.91% 18.421us 0.91% 18.421us 6.140us 0.000us 0.00% 0.000us 0.000us 3
4144
+ cudaLaunchKernel 9.77% 198.004us 9.77% 198.004us 66.001us 0.000us 0.00% 0.000us 0.000us 3
4145
+ cudaDeviceSynchronize 0.24% 4.950us 0.24% 4.950us 4.950us 0.000us 0.00% 0.000us 0.000us 1
4146
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4147
+ Self CPU time total: 2.027ms
4148
+ Self CUDA time total: 9.407us
4149
 
4150
 
4151
 
 
4155
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4156
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4157
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4158
+ hf_kernels_swiglu 0.00% 0.000us 0.00% 0.000us 0.000us 60.576us 465.11% 60.576us 60.576us 1
4159
+ hf_kernels_swiglu 15.18% 83.082us 99.12% 542.352us 542.352us 0.000us 0.00% 17.408us 17.408us 1
4160
+ _activation_23bf3fb::silu_and_mul 3.66% 20.041us 80.66% 441.340us 147.113us 13.024us 100.00% 17.408us 5.803us 3
4161
+ void vllm::act_and_mul_kernel&lt;c10::BFloat16, &amp;(c10::... 0.00% 0.000us 0.00% 0.000us 0.000us 13.024us 100.00% 13.024us 4.341us 3
4162
+ Activity Buffer Request 41.24% 225.625us 41.24% 225.625us 225.625us 4.384us 33.66% 4.384us 4.384us 1
4163
+ aten::empty 3.28% 17.930us 3.28% 17.930us 5.977us 0.000us 0.00% 0.000us 0.000us 3
4164
+ cudaLaunchKernel 35.76% 195.674us 35.76% 195.674us 65.225us 0.000us 0.00% 0.000us 0.000us 3
4165
+ cudaDeviceSynchronize 0.88% 4.811us 0.88% 4.811us 4.811us 0.000us 0.00% 0.000us 0.000us 1
4166
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4167
+ Self CPU time total: 547.163us
4168
+ Self CUDA time total: 13.024us
4169
 
4170
 
4171
  impl wl p50(ms) ok
 
4182
  <div class="uv-install-logs" id="uv-logs-benchmark">
4183
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4184
  <div class="uv-logs-content" style="display: none;">
4185
+ Installed 51 packages in 306ms
4186
  </div>
4187
  </div>
4188
+ <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
4189
+
4190
+ Fetching 7 files: 29%|██▊ | 2/7 [00:00&lt;00:00, 17.51it/s]
4191
+ Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 14.39it/s]
4192
+ Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 20.57it/s]</div>
4193
  <div class="cell-artifacts">
4194
  <h4>Artifacts:</h4>
4195
  <a href="artifacts/benchmark/activation.jsonl" class="artifact" target="_blank">activation.jsonl</a>
activation/impls/index.html CHANGED
@@ -82,8 +82,7 @@
82
  </div>
83
  <h1>Index of /activation/impls</h1>
84
  <ul>
85
- <li><a href='hf_kernels_swiglu.html' class='file'>hf_kernels_swiglu.html</a></li>
86
- <li><a href='torch_swiglu.html' class='file'>torch_swiglu.html</a></li>
87
  </ul>
88
  </body>
89
  </html>
 
82
  </div>
83
  <h1>Index of /activation/impls</h1>
84
  <ul>
85
+ <li><a href='torch_swiglu_darwin.html' class='file'>torch_swiglu_darwin.html</a></li>
 
86
  </ul>
87
  </body>
88
  </html>
activation/impls/torch_swiglu.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.29s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3904,7 +3904,7 @@ Cell: nv | 0.29s
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
- <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 19:54:13 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
@@ -3913,7 +3913,7 @@ Cell: nv | 0.29s
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
- | N/A 35C P0 120W / 350W | 0MiB / 46068MiB | 100% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
@@ -3937,7 +3937,7 @@ Cell: nv | 0.29s
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3939
  </span> |
3940
- Cell: benchmark | 3.69s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3987,20 +3987,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D768
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 187.393us 1471.25% 187.393us 187.393us 1
3991
- torch_eager 9.10% 197.603us 99.31% 2.157ms 2.157ms 0.000us 0.00% 15.073us 15.073us 1
3992
- aten::silu 2.86% 62.203us 85.13% 1.849ms 616.358us 6.561us 51.51% 8.897us 2.966us 3
3993
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.561us 51.51% 6.561us 2.187us 3
3994
- aten::mul 1.58% 34.212us 2.55% 55.432us 18.477us 6.176us 48.49% 6.176us 2.059us 3
3995
  void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.49% 6.176us 2.059us 3
3996
- Activity Buffer Request 80.18% 1.741ms 80.18% 1.741ms 1.741ms 2.336us 18.34% 2.336us 2.336us 1
3997
- aten::slice 2.02% 43.964us 2.53% 55.013us 9.169us 0.000us 0.00% 0.000us 0.000us 6
3998
- aten::as_strided 0.51% 11.049us 0.51% 11.049us 1.842us 0.000us 0.00% 0.000us 0.000us 6
3999
- cudaLaunchKernel 3.07% 66.630us 3.07% 66.630us 11.105us 0.000us 0.00% 0.000us 0.000us 6
4000
- cudaDeviceSynchronize 0.69% 14.920us 0.69% 14.920us 14.920us 0.000us 0.00% 0.000us 0.000us 1
4001
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4002
- Self CPU time total: 2.172ms
4003
- Self CUDA time total: 12.737us
4004
 
4005
 
4006
 
@@ -4010,20 +4010,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D1024
4010
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4011
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 159.648us 1289.36% 159.648us 159.648us 1
4014
- torch_eager 6.57% 137.523us 99.70% 2.087ms 2.087ms 0.000us 0.00% 14.526us 14.526us 1
4015
- aten::silu 2.02% 42.391us 89.22% 1.868ms 622.711us 6.399us 51.68% 8.543us 2.848us 3
4016
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.399us 51.68% 6.399us 2.133us 3
4017
- aten::mul 1.43% 29.882us 2.35% 49.282us 16.427us 5.983us 48.32% 5.983us 1.994us 3
4018
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.983us 48.32% 5.983us 1.994us 3
4019
- Activity Buffer Request 85.88% 1.798ms 85.88% 1.798ms 1.798ms 2.144us 17.32% 2.144us 2.144us 1
4020
- aten::slice 1.30% 27.292us 1.55% 32.512us 5.419us 0.000us 0.00% 0.000us 0.000us 6
4021
- aten::as_strided 0.25% 5.220us 0.25% 5.220us 0.870us 0.000us 0.00% 0.000us 0.000us 6
4022
- cudaLaunchKernel 2.25% 47.061us 2.25% 47.061us 7.843us 0.000us 0.00% 0.000us 0.000us 6
4023
- cudaDeviceSynchronize 0.30% 6.330us 0.30% 6.330us 6.330us 0.000us 0.00% 0.000us 0.000us 1
4024
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4025
- Self CPU time total: 2.094ms
4026
- Self CUDA time total: 12.382us
4027
 
4028
 
4029
 
@@ -4033,20 +4033,20 @@ PROFILE TRACE: torch_eager | cuda_T128_D2048
4033
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4034
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 155.680us 1172.29% 155.680us 155.680us 1
4037
- torch_eager 6.64% 129.562us 99.68% 1.946ms 1.946ms 0.000us 0.00% 15.584us 15.584us 1
4038
- aten::silu 2.16% 42.182us 89.10% 1.739ms 579.704us 6.848us 51.57% 9.152us 3.051us 3
4039
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.848us 51.57% 6.848us 2.283us 3
4040
- aten::mul 1.46% 28.592us 2.38% 46.553us 15.518us 6.432us 48.43% 6.432us 2.144us 3
4041
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.432us 48.43% 6.432us 2.144us 3
4042
- Activity Buffer Request 85.60% 1.671ms 85.60% 1.671ms 1.671ms 2.304us 17.35% 2.304us 2.304us 1
4043
- aten::slice 1.31% 25.640us 1.56% 30.540us 5.090us 0.000us 0.00% 0.000us 0.000us 6
4044
- aten::as_strided 0.25% 4.900us 0.25% 4.900us 0.817us 0.000us 0.00% 0.000us 0.000us 6
4045
- cudaLaunchKernel 2.26% 44.052us 2.26% 44.052us 7.342us 0.000us 0.00% 0.000us 0.000us 6
4046
- cudaDeviceSynchronize 0.32% 6.150us 0.32% 6.150us 6.150us 0.000us 0.00% 0.000us 0.000us 1
4047
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4048
- Self CPU time total: 1.952ms
4049
- Self CUDA time total: 13.280us
4050
 
4051
 
4052
 
@@ -4056,20 +4056,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D768
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 160.289us 1264.91% 160.289us 160.289us 1
4060
- torch_eager 6.06% 136.754us 99.75% 2.252ms 2.252ms 0.000us 0.00% 14.880us 14.880us 1
4061
- aten::silu 1.87% 42.159us 90.17% 2.036ms 678.503us 6.560us 51.77% 8.768us 2.923us 3
4062
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.77% 6.560us 2.187us 3
4063
- aten::mul 1.25% 28.231us 2.20% 49.632us 16.544us 6.112us 48.23% 6.112us 2.037us 3
4064
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.112us 48.23% 6.112us 2.037us 3
4065
- Activity Buffer Request 79.28% 1.790ms 79.28% 1.790ms 1.790ms 2.208us 17.42% 2.208us 2.208us 1
4066
- aten::slice 1.09% 24.671us 1.32% 29.801us 4.967us 0.000us 0.00% 0.000us 0.000us 6
4067
- aten::as_strided 0.23% 5.130us 0.23% 5.130us 0.855us 0.000us 0.00% 0.000us 0.000us 6
4068
- cudaLaunchKernel 9.98% 225.208us 9.98% 225.208us 37.535us 0.000us 0.00% 0.000us 0.000us 6
4069
- cudaDeviceSynchronize 0.25% 5.621us 0.25% 5.621us 5.621us 0.000us 0.00% 0.000us 0.000us 1
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
  Self CPU time total: 2.257ms
4072
- Self CUDA time total: 12.672us
4073
 
4074
 
4075
 
@@ -4079,20 +4079,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D1024
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4081
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4082
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 159.295us 1196.72% 159.295us 159.295us 1
4083
- torch_eager 6.43% 135.135us 99.75% 2.096ms 2.096ms 0.000us 0.00% 15.615us 15.615us 1
4084
- aten::silu 2.00% 41.931us 89.60% 1.883ms 627.518us 6.815us 51.20% 9.119us 3.040us 3
4085
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.815us 51.20% 6.815us 2.272us 3
4086
- aten::mul 1.42% 29.749us 2.27% 47.691us 15.897us 6.496us 48.80% 6.496us 2.165us 3
4087
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.496us 48.80% 6.496us 2.165us 3
4088
- Activity Buffer Request 79.61% 1.673ms 79.61% 1.673ms 1.673ms 2.304us 17.31% 2.304us 2.304us 1
4089
- aten::slice 1.22% 25.650us 1.46% 30.630us 5.105us 0.000us 0.00% 0.000us 0.000us 6
4090
- aten::as_strided 0.24% 4.980us 0.24% 4.980us 0.830us 0.000us 0.00% 0.000us 0.000us 6
4091
- cudaLaunchKernel 8.84% 185.847us 8.84% 185.847us 30.974us 0.000us 0.00% 0.000us 0.000us 6
4092
- cudaDeviceSynchronize 0.25% 5.161us 0.25% 5.161us 5.161us 0.000us 0.00% 0.000us 0.000us 1
4093
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4094
- Self CPU time total: 2.101ms
4095
- Self CUDA time total: 13.311us
4096
 
4097
 
4098
 
@@ -4102,20 +4102,20 @@ PROFILE TRACE: torch_eager | cuda_T256_D2048
4102
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4103
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4104
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4105
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 158.432us 1014.55% 158.432us 158.432us 1
4106
- torch_eager 6.38% 140.261us 99.75% 2.192ms 2.192ms 0.000us 0.00% 18.304us 18.304us 1
4107
- aten::silu 1.93% 42.492us 89.81% 1.973ms 657.799us 8.000us 51.23% 10.688us 3.563us 3
4108
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 8.000us 51.23% 8.000us 2.667us 3
4109
- aten::mul 1.27% 27.872us 2.10% 46.122us 15.374us 7.616us 48.77% 7.616us 2.539us 3
4110
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.616us 48.77% 7.616us 2.539us 3
4111
- Activity Buffer Request 80.61% 1.771ms 80.61% 1.771ms 1.771ms 2.688us 17.21% 2.688us 2.688us 1
4112
- aten::slice 1.22% 26.832us 1.46% 31.992us 5.332us 0.000us 0.00% 0.000us 0.000us 6
4113
- aten::as_strided 0.23% 5.160us 0.23% 5.160us 0.860us 0.000us 0.00% 0.000us 0.000us 6
4114
- cudaLaunchKernel 8.09% 177.845us 8.09% 177.845us 29.641us 0.000us 0.00% 0.000us 0.000us 6
4115
- cudaDeviceSynchronize 0.25% 5.530us 0.25% 5.530us 5.530us 0.000us 0.00% 0.000us 0.000us 1
4116
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4117
- Self CPU time total: 2.197ms
4118
- Self CUDA time total: 15.616us
4119
 
4120
 
4121
 
@@ -4125,20 +4125,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D768
4125
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4126
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4127
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4128
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 158.083us 1097.65% 158.083us 158.083us 1
4129
- torch_eager 6.35% 128.334us 99.75% 2.015ms 2.015ms 0.000us 0.00% 16.898us 16.898us 1
4130
- aten::silu 2.10% 42.419us 89.46% 1.807ms 602.407us 7.394us 51.34% 9.890us 3.297us 3
4131
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.394us 51.34% 7.394us 2.465us 3
4132
- aten::mul 1.39% 28.141us 2.40% 48.382us 16.127us 7.008us 48.66% 7.008us 2.336us 3
4133
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.008us 48.66% 7.008us 2.336us 3
4134
- Activity Buffer Request 79.49% 1.606ms 79.49% 1.606ms 1.606ms 2.496us 17.33% 2.496us 2.496us 1
4135
- aten::slice 1.27% 25.691us 1.54% 31.081us 5.180us 0.000us 0.00% 0.000us 0.000us 6
4136
- aten::as_strided 0.27% 5.390us 0.27% 5.390us 0.898us 0.000us 0.00% 0.000us 0.000us 6
4137
- cudaLaunchKernel 8.88% 179.306us 8.88% 179.306us 29.884us 0.000us 0.00% 0.000us 0.000us 6
4138
- cudaDeviceSynchronize 0.25% 5.100us 0.25% 5.100us 5.100us 0.000us 0.00% 0.000us 0.000us 1
4139
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4140
- Self CPU time total: 2.020ms
4141
- Self CUDA time total: 14.402us
4142
 
4143
 
4144
 
@@ -4148,20 +4148,20 @@ PROFILE TRACE: torch_eager | cuda_T512_D1024
4148
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4149
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4150
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4151
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 158.658us 1020.18% 158.658us 158.658us 1
4152
- torch_eager 5.52% 111.823us 99.73% 2.019ms 2.019ms 0.000us 0.00% 18.240us 18.240us 1
4153
- aten::silu 2.12% 42.830us 90.25% 1.827ms 609.110us 7.936us 51.03% 10.624us 3.541us 3
4154
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.03% 7.936us 2.645us 3
4155
- aten::mul 1.37% 27.772us 2.44% 49.332us 16.444us 7.616us 48.97% 7.616us 2.539us 3
4156
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.616us 48.97% 7.616us 2.539us 3
4157
- Activity Buffer Request 80.18% 1.623ms 80.18% 1.623ms 1.623ms 2.688us 17.28% 2.688us 2.688us 1
4158
- aten::slice 1.25% 25.302us 1.51% 30.641us 5.107us 0.000us 0.00% 0.000us 0.000us 6
4159
- aten::as_strided 0.26% 5.339us 0.26% 5.339us 0.890us 0.000us 0.00% 0.000us 0.000us 6
4160
- cudaLaunchKernel 9.02% 182.624us 9.02% 182.624us 30.437us 0.000us 0.00% 0.000us 0.000us 6
4161
- cudaDeviceSynchronize 0.27% 5.520us 0.27% 5.520us 5.520us 0.000us 0.00% 0.000us 0.000us 1
4162
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4163
- Self CPU time total: 2.025ms
4164
- Self CUDA time total: 15.552us
4165
 
4166
 
4167
 
@@ -4171,24 +4171,24 @@ PROFILE TRACE: torch_eager | cuda_T512_D2048
4171
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4172
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4173
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4174
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 164.002us 726.96% 164.002us 164.002us 1
4175
- torch_eager 5.39% 111.814us 99.74% 2.071ms 2.071ms 0.000us 0.00% 26.464us 26.464us 1
4176
- aten::silu 2.07% 43.010us 90.61% 1.881ms 627.114us 11.616us 51.49% 15.520us 5.173us 3
4177
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.616us 51.49% 11.616us 3.872us 3
4178
- aten::mul 1.37% 28.451us 2.32% 48.232us 16.077us 10.944us 48.51% 10.944us 3.648us 3
4179
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.944us 48.51% 10.944us 3.648us 3
4180
- Activity Buffer Request 80.76% 1.677ms 80.76% 1.677ms 1.677ms 3.904us 17.30% 3.904us 3.904us 1
4181
- aten::slice 1.14% 23.769us 1.41% 29.310us 4.885us 0.000us 0.00% 0.000us 0.000us 6
4182
- aten::as_strided 0.27% 5.541us 0.27% 5.541us 0.923us 0.000us 0.00% 0.000us 0.000us 6
4183
- cudaLaunchKernel 8.74% 181.415us 8.74% 181.415us 30.236us 0.000us 0.00% 0.000us 0.000us 6
4184
- cudaDeviceSynchronize 0.26% 5.500us 0.26% 5.500us 5.500us 0.000us 0.00% 0.000us 0.000us 1
4185
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4186
- Self CPU time total: 2.076ms
4187
- Self CUDA time total: 22.560us
4188
 
4189
 
4190
  impl wl p50(ms) ok
4191
- torch_eager cuda_T128_D1024 0.06 True
4192
  torch_eager cuda_T128_D2048 0.05 True
4193
  torch_eager cuda_T128_D768 0.04 True
4194
  torch_eager cuda_T256_D1024 0.05 True
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.25s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 23:01:11 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
 
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
+ | N/A 39C P0 82W / 350W | 0MiB / 46068MiB | 10% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
 
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3939
  </span> |
3940
+ Cell: benchmark | 3.88s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 176.512us 1385.93% 176.512us 176.512us 1
3991
+ torch_eager 8.54% 185.335us 99.30% 2.155ms 2.155ms 0.000us 0.00% 15.072us 15.072us 1
3992
+ aten::silu 2.61% 56.610us 85.90% 1.864ms 621.400us 6.560us 51.51% 8.896us 2.965us 3
3993
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.560us 51.51% 6.560us 2.187us 3
3994
+ aten::mul 1.46% 31.580us 2.49% 54.091us 18.030us 6.176us 48.49% 6.176us 2.059us 3
3995
  void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.176us 48.49% 6.176us 2.059us 3
3996
+ Activity Buffer Request 81.25% 1.763ms 81.25% 1.763ms 1.763ms 2.336us 18.34% 2.336us 2.336us 1
3997
+ aten::slice 1.93% 41.799us 2.37% 51.470us 8.578us 0.000us 0.00% 0.000us 0.000us 6
3998
+ aten::as_strided 0.45% 9.671us 0.45% 9.671us 1.612us 0.000us 0.00% 0.000us 0.000us 6
3999
+ cudaLaunchKernel 3.08% 66.742us 3.08% 66.742us 11.124us 0.000us 0.00% 0.000us 0.000us 6
4000
+ cudaDeviceSynchronize 0.70% 15.141us 0.70% 15.141us 15.141us 0.000us 0.00% 0.000us 0.000us 1
4001
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4002
+ Self CPU time total: 2.170ms
4003
+ Self CUDA time total: 12.736us
4004
 
4005
 
4006
 
 
4010
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4011
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 147.103us 1187.66% 147.103us 147.103us 1
4014
+ torch_eager 6.42% 134.022us 99.73% 2.081ms 2.081ms 0.000us 0.00% 14.563us 14.563us 1
4015
+ aten::silu 1.84% 38.392us 89.66% 1.871ms 623.681us 6.401us 51.68% 8.578us 2.859us 3
4016
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.401us 51.68% 6.401us 2.134us 3
4017
+ aten::mul 1.30% 27.120us 2.25% 46.940us 15.647us 5.985us 48.32% 5.985us 1.995us 3
4018
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 5.985us 48.32% 5.985us 1.995us 3
4019
+ Activity Buffer Request 86.49% 1.805ms 86.49% 1.805ms 1.805ms 2.177us 17.58% 2.177us 2.177us 1
4020
+ aten::slice 1.12% 23.282us 1.39% 29.102us 4.850us 0.000us 0.00% 0.000us 0.000us 6
4021
+ aten::as_strided 0.28% 5.820us 0.28% 5.820us 0.970us 0.000us 0.00% 0.000us 0.000us 6
4022
+ cudaLaunchKernel 2.28% 47.661us 2.28% 47.661us 7.944us 0.000us 0.00% 0.000us 0.000us 6
4023
+ cudaDeviceSynchronize 0.27% 5.671us 0.27% 5.671us 5.671us 0.000us 0.00% 0.000us 0.000us 1
4024
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4025
+ Self CPU time total: 2.087ms
4026
+ Self CUDA time total: 12.386us
4027
 
4028
 
4029
 
 
4033
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4034
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4035
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4036
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 145.888us 1106.55% 145.888us 145.888us 1
4037
+ torch_eager 6.31% 124.322us 99.70% 1.963ms 1.963ms 0.000us 0.00% 15.456us 15.456us 1
4038
+ aten::silu 2.05% 40.451us 89.58% 1.764ms 587.980us 6.784us 51.46% 9.056us 3.019us 3
4039
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.784us 51.46% 6.784us 2.261us 3
4040
+ aten::mul 1.27% 25.091us 2.33% 45.941us 15.314us 6.400us 48.54% 6.400us 2.133us 3
4041
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.400us 48.54% 6.400us 2.133us 3
4042
+ Activity Buffer Request 86.22% 1.698ms 86.22% 1.698ms 1.698ms 2.272us 17.23% 2.272us 2.272us 1
4043
+ aten::slice 1.19% 23.361us 1.47% 29.031us 4.839us 0.000us 0.00% 0.000us 0.000us 6
4044
+ aten::as_strided 0.29% 5.670us 0.29% 5.670us 0.945us 0.000us 0.00% 0.000us 0.000us 6
4045
+ cudaLaunchKernel 2.36% 46.481us 2.36% 46.481us 7.747us 0.000us 0.00% 0.000us 0.000us 6
4046
+ cudaDeviceSynchronize 0.30% 5.880us 0.30% 5.880us 5.880us 0.000us 0.00% 0.000us 0.000us 1
4047
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4048
+ Self CPU time total: 1.969ms
4049
+ Self CUDA time total: 13.184us
4050
 
4051
 
4052
 
 
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4058
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4059
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 156.959us 1244.82% 156.959us 156.959us 1
4060
+ torch_eager 5.42% 122.252us 99.77% 2.252ms 2.252ms 0.000us 0.00% 14.785us 14.785us 1
4061
+ aten::silu 1.78% 40.211us 90.81% 2.050ms 683.202us 6.497us 51.53% 8.673us 2.891us 3
4062
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.497us 51.53% 6.497us 2.166us 3
4063
+ aten::mul 1.27% 28.640us 2.19% 49.471us 16.490us 6.112us 48.47% 6.112us 2.037us 3
4064
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.112us 48.47% 6.112us 2.037us 3
4065
+ Activity Buffer Request 80.04% 1.807ms 80.04% 1.807ms 1.807ms 2.176us 17.26% 2.176us 2.176us 1
4066
+ aten::slice 1.10% 24.730us 1.36% 30.660us 5.110us 0.000us 0.00% 0.000us 0.000us 6
4067
+ aten::as_strided 0.26% 5.930us 0.26% 5.930us 0.988us 0.000us 0.00% 0.000us 0.000us 6
4068
+ cudaLaunchKernel 9.91% 223.637us 9.91% 223.637us 37.273us 0.000us 0.00% 0.000us 0.000us 6
4069
+ cudaDeviceSynchronize 0.23% 5.130us 0.23% 5.130us 5.130us 0.000us 0.00% 0.000us 0.000us 1
4070
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4071
  Self CPU time total: 2.257ms
4072
+ Self CUDA time total: 12.609us
4073
 
4074
 
4075
 
 
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4081
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4082
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 160.703us 1215.97% 160.703us 160.703us 1
4083
+ torch_eager 6.46% 135.762us 99.74% 2.098ms 2.098ms 0.000us 0.00% 15.488us 15.488us 1
4084
+ aten::silu 1.92% 40.421us 89.37% 1.880ms 626.541us 6.816us 51.57% 9.088us 3.029us 3
4085
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.816us 51.57% 6.816us 2.272us 3
4086
+ aten::mul 1.37% 28.851us 2.33% 49.101us 16.367us 6.400us 48.43% 6.400us 2.133us 3
4087
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.400us 48.43% 6.400us 2.133us 3
4088
+ Activity Buffer Request 79.67% 1.676ms 79.67% 1.676ms 1.676ms 2.272us 17.19% 2.272us 2.272us 1
4089
+ aten::slice 1.24% 26.071us 1.57% 33.081us 5.513us 0.000us 0.00% 0.000us 0.000us 6
4090
+ aten::as_strided 0.33% 7.010us 0.33% 7.010us 1.168us 0.000us 0.00% 0.000us 0.000us 6
4091
+ cudaLaunchKernel 8.75% 183.945us 8.75% 183.945us 30.657us 0.000us 0.00% 0.000us 0.000us 6
4092
+ cudaDeviceSynchronize 0.26% 5.530us 0.26% 5.530us 5.530us 0.000us 0.00% 0.000us 0.000us 1
4093
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4094
+ Self CPU time total: 2.103ms
4095
+ Self CUDA time total: 13.216us
4096
 
4097
 
4098
 
 
4102
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4103
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4104
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4105
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 160.606us 1034.83% 160.606us 160.606us 1
4106
+ torch_eager 5.99% 133.963us 99.76% 2.233ms 2.233ms 0.000us 0.00% 18.208us 18.208us 1
4107
+ aten::silu 1.79% 40.170us 90.10% 2.017ms 672.181us 7.936us 51.13% 10.624us 3.541us 3
4108
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.936us 51.13% 7.936us 2.645us 3
4109
+ aten::mul 1.29% 28.971us 2.18% 48.701us 16.234us 7.584us 48.87% 7.584us 2.528us 3
4110
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.584us 48.87% 7.584us 2.528us 3
4111
+ Activity Buffer Request 81.23% 1.818ms 81.23% 1.818ms 1.818ms 2.688us 17.32% 2.688us 2.688us 1
4112
+ aten::slice 1.18% 26.440us 1.50% 33.480us 5.580us 0.000us 0.00% 0.000us 0.000us 6
4113
+ aten::as_strided 0.31% 7.040us 0.31% 7.040us 1.173us 0.000us 0.00% 0.000us 0.000us 6
4114
+ cudaLaunchKernel 7.96% 178.055us 7.96% 178.055us 29.676us 0.000us 0.00% 0.000us 0.000us 6
4115
+ cudaDeviceSynchronize 0.24% 5.430us 0.24% 5.430us 5.430us 0.000us 0.00% 0.000us 0.000us 1
4116
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4117
+ Self CPU time total: 2.238ms
4118
+ Self CUDA time total: 15.520us
4119
 
4120
 
4121
 
 
4125
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4126
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4127
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4128
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 161.343us 1127.96% 161.343us 161.343us 1
4129
+ torch_eager 6.15% 126.753us 99.76% 2.055ms 2.055ms 0.000us 0.00% 16.768us 16.768us 1
4130
+ aten::silu 2.04% 42.050us 89.57% 1.845ms 614.923us 7.328us 51.23% 9.792us 3.264us 3
4131
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.328us 51.23% 7.328us 2.443us 3
4132
+ aten::mul 1.44% 29.680us 2.44% 50.310us 16.770us 6.976us 48.77% 6.976us 2.325us 3
4133
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 6.976us 48.77% 6.976us 2.325us 3
4134
+ Activity Buffer Request 78.32% 1.613ms 78.32% 1.613ms 1.613ms 2.464us 17.23% 2.464us 2.464us 1
4135
+ aten::slice 1.25% 25.802us 1.59% 32.722us 5.454us 0.000us 0.00% 0.000us 0.000us 6
4136
+ aten::as_strided 0.34% 6.920us 0.34% 6.920us 1.153us 0.000us 0.00% 0.000us 0.000us 6
4137
+ cudaLaunchKernel 10.21% 210.375us 10.21% 210.375us 35.062us 0.000us 0.00% 0.000us 0.000us 6
4138
+ cudaDeviceSynchronize 0.24% 4.981us 0.24% 4.981us 4.981us 0.000us 0.00% 0.000us 0.000us 1
4139
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4140
+ Self CPU time total: 2.060ms
4141
+ Self CUDA time total: 14.304us
4142
 
4143
 
4144
 
 
4148
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4149
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4150
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4151
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 155.936us 1000.62% 155.936us 155.936us 1
4152
+ torch_eager 5.31% 107.073us 99.73% 2.011ms 2.011ms 0.000us 0.00% 18.272us 18.272us 1
4153
+ aten::silu 1.95% 39.312us 90.55% 1.825ms 608.464us 7.968us 51.13% 10.656us 3.552us 3
4154
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.968us 51.13% 7.968us 2.656us 3
4155
+ aten::mul 1.40% 28.240us 2.34% 47.090us 15.697us 7.616us 48.87% 7.616us 2.539us 3
4156
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 7.616us 48.87% 7.616us 2.539us 3
4157
+ Activity Buffer Request 80.78% 1.628ms 80.78% 1.628ms 1.628ms 2.688us 17.25% 2.688us 2.688us 1
4158
+ aten::slice 1.22% 24.550us 1.54% 30.960us 5.160us 0.000us 0.00% 0.000us 0.000us 6
4159
+ aten::as_strided 0.32% 6.410us 0.32% 6.410us 1.068us 0.000us 0.00% 0.000us 0.000us 6
4160
+ cudaLaunchKernel 8.75% 176.473us 8.75% 176.473us 29.412us 0.000us 0.00% 0.000us 0.000us 6
4161
+ cudaDeviceSynchronize 0.27% 5.381us 0.27% 5.381us 5.381us 0.000us 0.00% 0.000us 0.000us 1
4162
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4163
+ Self CPU time total: 2.016ms
4164
+ Self CUDA time total: 15.584us
4165
 
4166
 
4167
 
 
4171
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4172
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4173
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4174
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 156.608us 695.20% 156.608us 156.608us 1
4175
+ torch_eager 4.97% 102.273us 99.73% 2.054ms 2.054ms 0.000us 0.00% 26.431us 26.431us 1
4176
+ aten::silu 1.93% 39.830us 90.91% 1.872ms 624.047us 11.552us 51.28% 15.456us 5.152us 3
4177
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 11.552us 51.28% 11.552us 3.851us 3
4178
+ aten::mul 1.40% 28.900us 2.35% 48.460us 16.153us 10.975us 48.72% 10.975us 3.658us 3
4179
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 10.975us 48.72% 10.975us 3.658us 3
4180
+ Activity Buffer Request 81.21% 1.672ms 81.21% 1.672ms 1.672ms 3.904us 17.33% 3.904us 3.904us 1
4181
+ aten::slice 1.20% 24.753us 1.50% 30.941us 5.157us 0.000us 0.00% 0.000us 0.000us 6
4182
+ aten::as_strided 0.30% 6.188us 0.30% 6.188us 1.031us 0.000us 0.00% 0.000us 0.000us 6
4183
+ cudaLaunchKernel 8.72% 179.534us 8.72% 179.534us 29.922us 0.000us 0.00% 0.000us 0.000us 6
4184
+ cudaDeviceSynchronize 0.27% 5.530us 0.27% 5.530us 5.530us 0.000us 0.00% 0.000us 0.000us 1
4185
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4186
+ Self CPU time total: 2.059ms
4187
+ Self CUDA time total: 22.527us
4188
 
4189
 
4190
  impl wl p50(ms) ok
4191
+ torch_eager cuda_T128_D1024 0.05 True
4192
  torch_eager cuda_T128_D2048 0.05 True
4193
  torch_eager cuda_T128_D768 0.04 True
4194
  torch_eager cuda_T256_D1024 0.05 True
activation/impls/torch_swiglu_darwin.html ADDED
The diff for this file is too large to render. See raw diff
 
activation/index.html CHANGED
@@ -83,7 +83,7 @@
83
  <h1>Index of /activation</h1>
84
  <ul>
85
  <li><a href='impls/index.html' class='dir'>impls/</a></li>
86
- <li><a href='results_linux/index.html' class='dir'>results_linux/</a></li>
87
  </ul>
88
  </body>
89
  </html>
 
83
  <h1>Index of /activation</h1>
84
  <ul>
85
  <li><a href='impls/index.html' class='dir'>impls/</a></li>
86
+ <li><a href='results_darwin/index.html' class='dir'>results_darwin/</a></li>
87
  </ul>
88
  </body>
89
  </html>
activation/results_darwin/artifacts/combine/latency.svg ADDED

Git LFS Details

  • SHA256: e5289a42b37402c0614d1b53534a0180586b4d0b88305535d38de44de6b50881
  • Pointer size: 128 Bytes
  • Size of remote file: 947 Bytes
activation/results_darwin/cells/combine.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # /// script
2
+ # requires-python = ">=3.10"
3
+ # dependencies = [
4
+ # "numpy",
5
+ # "torch==2.8.0",
6
+ # "kernels-benchmark-tools",
7
+ # "matplotlib"
8
+ # ]
9
+ #
10
+ # [tool.uv.sources]
11
+ # kernels-benchmark-tools = { path = "../../../../../tools", editable = true }
12
+ # ///
13
+ from kernels_benchmark_tools.core.visuals import generate_combined_results
14
+
15
+ # Map display names to uvnote environment variables
16
+ cache_env_map = {
17
+ "PyTorch SwiGLU (macOS)": "UVNOTE_FILE_TORCH_SWIGLU_DARWIN_BENCHMARK",
18
+ }
19
+
20
+ # Generate combined results with visualization
21
+ generate_combined_results(
22
+ cache_env_map=cache_env_map,
23
+ output_filename="activation.jsonl",
24
+ svg_filename="latency.svg"
25
+ )
activation/results_darwin/combined_results.html ADDED
The diff for this file is too large to render. See raw diff
 
activation/results_darwin/index.html ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <meta charset='UTF-8'>
5
+ <meta name='viewport' content='width=device-width, initial-scale=1.0'>
6
+ <title>Index of /activation/results_darwin</title>
7
+ <style>
8
+ :root {
9
+ --bg-primary: #0a0a0a;
10
+ --bg-secondary: #121212;
11
+ --bg-tertiary: #181818;
12
+ --text-primary: #e0e0e0;
13
+ --text-secondary: #888888;
14
+ --text-link: #64b5f6;
15
+ --border-primary: #2a2a2a;
16
+ }
17
+ body {
18
+ font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif;
19
+ background: var(--bg-primary);
20
+ color: var(--text-primary);
21
+ margin: 0;
22
+ padding: 16px;
23
+ max-width: 900px;
24
+ margin: 0 auto;
25
+ }
26
+ .controls {
27
+ display: flex;
28
+ justify-content: flex-end;
29
+ margin-bottom: 1rem;
30
+ }
31
+ .back-button {
32
+ background: var(--bg-secondary);
33
+ border: 1px solid var(--border-primary);
34
+ padding: 8px 12px;
35
+ border-radius: 4px;
36
+ color: var(--text-secondary);
37
+ cursor: pointer;
38
+ font-size: 0.9rem;
39
+ text-decoration: none;
40
+ display: inline-block;
41
+ }
42
+ .back-button:hover {
43
+ color: var(--text-primary);
44
+ background: var(--bg-tertiary);
45
+ }
46
+ h1 {
47
+ font-size: 1.5em;
48
+ margin: 1rem 0;
49
+ color: var(--text-primary);
50
+ border-bottom: 1px solid var(--border-primary);
51
+ padding-bottom: 0.5rem;
52
+ }
53
+ ul {
54
+ list-style-type: none;
55
+ padding: 0;
56
+ }
57
+ li {
58
+ margin: 0;
59
+ border-bottom: 1px solid var(--border-primary);
60
+ }
61
+ li:last-child {
62
+ border-bottom: none;
63
+ }
64
+ a {
65
+ display: block;
66
+ padding: 0.75rem 0.5rem;
67
+ text-decoration: none;
68
+ color: var(--text-link);
69
+ transition: background 0.2s ease;
70
+ }
71
+ a:hover {
72
+ background: var(--bg-secondary);
73
+ }
74
+ .dir {
75
+ font-weight: 500;
76
+ }
77
+ </style>
78
+ </head>
79
+ <body>
80
+ <div class='controls'>
81
+ <a href='../index.html' class='back-button'>← back</a>
82
+ </div>
83
+ <h1>Index of /activation/results_darwin</h1>
84
+ <ul>
85
+ <li><a href='combined_results.html' class='file'>combined_results.html</a></li>
86
+ </ul>
87
+ </body>
88
+ </html>
activation/results_linux/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: e0ebb1563a7889f083e3a946d975f77f909986c659d9bdfad99579689fd355e5
  • Pointer size: 130 Bytes
  • Size of remote file: 21.5 kB

Git LFS Details

  • SHA256: 2c3c56353a22a5acffe8087c058e833d475a7a6c5bc584cf64d87b43035f7eef
  • Pointer size: 130 Bytes
  • Size of remote file: 20.7 kB
activation/results_linux/combined_results.html CHANGED
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
- <dc:date>2025-12-19T19:55:34.708013</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
@@ -4038,96 +4038,83 @@ body[data-tool="eraser"] .main-content {
4038
  <g id="matplotlib.axis_2">
4039
  <g id="ytick_1">
4040
  <g id="grid-y--2" class="grid grid-y">
4041
- <path d="M 60.23 448.789427 L 847.294169 448.789427 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4042
  </g>
4043
  <g id="line2d_10">
4044
  <defs>
4045
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4046
  </defs>
4047
  <g>
4048
- <use ns4:href="#m0fca2865ba" x="60.23" y="448.789427" style="stroke: #000000; stroke-width: 0.8" />
4049
  </g>
4050
  </g>
4051
  <g id="text_10">
4052
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="452.588646" transform="rotate(-0 53.23 452.588646)">0.025</text>
4053
  </g>
4054
  </g>
4055
  <g id="ytick_2">
4056
  <g id="grid-y--3" class="grid grid-y">
4057
- <path d="M 60.23 382.723832 L 847.294169 382.723832 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4058
  </g>
4059
  <g id="line2d_11">
4060
  <g>
4061
- <use ns4:href="#m0fca2865ba" x="60.23" y="382.723832" style="stroke: #000000; stroke-width: 0.8" />
4062
  </g>
4063
  </g>
4064
  <g id="text_11">
4065
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="386.523051" transform="rotate(-0 53.23 386.523051)">0.030</text>
4066
  </g>
4067
  </g>
4068
  <g id="ytick_3">
4069
  <g id="grid-y--4" class="grid grid-y">
4070
- <path d="M 60.23 316.658237 L 847.294169 316.658237 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4071
  </g>
4072
  <g id="line2d_12">
4073
  <g>
4074
- <use ns4:href="#m0fca2865ba" x="60.23" y="316.658237" style="stroke: #000000; stroke-width: 0.8" />
4075
  </g>
4076
  </g>
4077
  <g id="text_12">
4078
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="320.457455" transform="rotate(-0 53.23 320.457455)">0.035</text>
4079
  </g>
4080
  </g>
4081
  <g id="ytick_4">
4082
  <g id="grid-y--5" class="grid grid-y">
4083
- <path d="M 60.23 250.592641 L 847.294169 250.592641 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4084
  </g>
4085
  <g id="line2d_13">
4086
  <g>
4087
- <use ns4:href="#m0fca2865ba" x="60.23" y="250.592641" style="stroke: #000000; stroke-width: 0.8" />
4088
  </g>
4089
  </g>
4090
  <g id="text_13">
4091
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="254.39186" transform="rotate(-0 53.23 254.39186)">0.040</text>
4092
  </g>
4093
  </g>
4094
  <g id="ytick_5">
4095
  <g id="grid-y--6" class="grid grid-y">
4096
- <path d="M 60.23 184.527046 L 847.294169 184.527046 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4097
  </g>
4098
  <g id="line2d_14">
4099
  <g>
4100
- <use ns4:href="#m0fca2865ba" x="60.23" y="184.527046" style="stroke: #000000; stroke-width: 0.8" />
4101
  </g>
4102
  </g>
4103
  <g id="text_14">
4104
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="188.326264" transform="rotate(-0 53.23 188.326264)">0.045</text>
4105
  </g>
4106
  </g>
4107
  <g id="ytick_6">
4108
  <g id="grid-y--7" class="grid grid-y">
4109
- <path d="M 60.23 118.46145 L 847.294169 118.46145 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4110
  </g>
4111
  <g id="line2d_15">
4112
  <g>
4113
- <use ns4:href="#m0fca2865ba" x="60.23" y="118.46145" style="stroke: #000000; stroke-width: 0.8" />
4114
  </g>
4115
  </g>
4116
  <g id="text_15">
4117
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="122.260669" transform="rotate(-0 53.23 122.260669)">0.050</text>
4118
- </g>
4119
- </g>
4120
- <g id="ytick_7">
4121
- <g id="grid-y--8" class="grid grid-y">
4122
- <path d="M 60.23 52.395855 L 847.294169 52.395855 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4123
- </g>
4124
- <g id="line2d_16">
4125
- <g>
4126
- <use ns4:href="#m0fca2865ba" x="60.23" y="52.395855" style="stroke: #000000; stroke-width: 0.8" />
4127
- </g>
4128
- </g>
4129
- <g id="text_16">
4130
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="56.195074" transform="rotate(-0 53.23 56.195074)">0.055</text>
4131
  </g>
4132
  </g>
4133
  <g id="label--y" class="ylabel">
@@ -4135,37 +4122,37 @@ body[data-tool="eraser"] .main-content {
4135
  </g>
4136
  </g>
4137
  <g id="series--hf-kernels-swiglu" class="series">
4138
- <path d="M 96.005644 451.16779 L 185.444754 375.575536 L 274.883864 378.218159 L 364.322974 389.185046 L 453.762084 390.255309 L 543.201194 395.791605 L 632.640304 397.773576 L 722.079415 392.884719 L 811.518525 395.13095 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4139
  <defs>
4140
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4141
  </defs>
4142
  <g clip-path="url(#p620c7d392f)">
4143
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4144
- <use ns4:href="#md7efaf3aec" x="185.444754" y="375.575536" style="fill: #1f77b4; stroke: #1f77b4" />
4145
- <use ns4:href="#md7efaf3aec" x="274.883864" y="378.218159" style="fill: #1f77b4; stroke: #1f77b4" />
4146
- <use ns4:href="#md7efaf3aec" x="364.322974" y="389.185046" style="fill: #1f77b4; stroke: #1f77b4" />
4147
- <use ns4:href="#md7efaf3aec" x="453.762084" y="390.255309" style="fill: #1f77b4; stroke: #1f77b4" />
4148
- <use ns4:href="#md7efaf3aec" x="543.201194" y="395.791605" style="fill: #1f77b4; stroke: #1f77b4" />
4149
- <use ns4:href="#md7efaf3aec" x="632.640304" y="397.773576" style="fill: #1f77b4; stroke: #1f77b4" />
4150
- <use ns4:href="#md7efaf3aec" x="722.079415" y="392.884719" style="fill: #1f77b4; stroke: #1f77b4" />
4151
- <use ns4:href="#md7efaf3aec" x="811.518525" y="395.13095" style="fill: #1f77b4; stroke: #1f77b4" />
4152
  </g>
4153
  </g>
4154
  <g id="series--torch-eager" class="series">
4155
- <path d="M 96.005644 201.030231 L 185.444754 47.08418 L 274.883864 61.090089 L 364.322974 57.918941 L 453.762084 59.121331 L 543.201194 85.943964 L 632.640304 75.492387 L 722.079415 58.711725 L 811.518525 70.220354 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4156
  <defs>
4157
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4158
  </defs>
4159
  <g clip-path="url(#p620c7d392f)">
4160
- <use ns4:href="#m9b8c54d372" x="96.005644" y="201.030231" style="fill: #ff7f0e; stroke: #ff7f0e" />
4161
- <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4162
- <use ns4:href="#m9b8c54d372" x="274.883864" y="61.090089" style="fill: #ff7f0e; stroke: #ff7f0e" />
4163
- <use ns4:href="#m9b8c54d372" x="364.322974" y="57.918941" style="fill: #ff7f0e; stroke: #ff7f0e" />
4164
- <use ns4:href="#m9b8c54d372" x="453.762084" y="59.121331" style="fill: #ff7f0e; stroke: #ff7f0e" />
4165
- <use ns4:href="#m9b8c54d372" x="543.201194" y="85.943964" style="fill: #ff7f0e; stroke: #ff7f0e" />
4166
- <use ns4:href="#m9b8c54d372" x="632.640304" y="75.492387" style="fill: #ff7f0e; stroke: #ff7f0e" />
4167
- <use ns4:href="#m9b8c54d372" x="722.079415" y="58.711725" style="fill: #ff7f0e; stroke: #ff7f0e" />
4168
- <use ns4:href="#m9b8c54d372" x="811.518525" y="70.220354" style="fill: #ff7f0e; stroke: #ff7f0e" />
4169
  </g>
4170
  </g>
4171
  <g id="patch_3">
@@ -4180,14 +4167,14 @@ body[data-tool="eraser"] .main-content {
4180
  <g id="patch_6">
4181
  <path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4182
  </g>
4183
- <g id="text_17">
4184
  <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
4185
  </g>
4186
  <g id="legend" class="legend">
4187
  <g id="patch_7">
4188
  <path d="M 720.811356 466.37197 L 840.294169 466.37197 Q 842.294169 466.37197 842.294169 464.37197 L 842.294169 435.45947 Q 842.294169 433.45947 840.294169 433.45947 L 720.811356 433.45947 Q 718.811356 433.45947 718.811356 435.45947 L 718.811356 464.37197 Q 718.811356 466.37197 720.811356 466.37197 L 720.811356 466.37197 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4189
  </g>
4190
- <g id="line2d_17">
4191
  <path d="M 722.811356 441.557908 L 732.811356 441.557908 L 742.811356 441.557908 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4192
  <g>
4193
  <use ns4:href="#md7efaf3aec" x="732.811356" y="441.557908" style="fill: #1f77b4; stroke: #1f77b4" />
@@ -4196,7 +4183,7 @@ body[data-tool="eraser"] .main-content {
4196
  <g id="legend-label--hf-kernels-swiglu" class="legend">
4197
  <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="445.057908" transform="rotate(-0 750.811356 445.057908)">hf_kernels_swiglu</text>
4198
  </g>
4199
- <g id="line2d_18">
4200
  <path d="M 722.811356 456.514158 L 732.811356 456.514158 L 742.811356 456.514158 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4201
  <g>
4202
  <use ns4:href="#m9b8c54d372" x="732.811356" y="456.514158" style="fill: #ff7f0e; stroke: #ff7f0e" />
@@ -4223,7 +4210,7 @@ body[data-tool="eraser"] .main-content {
4223
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4224
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4225
  </span> |
4226
- Cell: combine | 4.50s
4227
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4228
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4229
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4319,7 +4306,7 @@ hf_kernels_swiglu cuda_T256_D768 0.03 True
4319
  hf_kernels_swiglu cuda_T512_D1024 0.03 True
4320
  hf_kernels_swiglu cuda_T512_D2048 0.03 True
4321
  hf_kernels_swiglu cuda_T512_D768 0.03 True
4322
- torch_eager cuda_T128_D1024 0.06 True
4323
  torch_eager cuda_T128_D2048 0.05 True
4324
  torch_eager cuda_T128_D768 0.04 True
4325
  torch_eager cuda_T256_D1024 0.05 True
@@ -4347,7 +4334,7 @@ Implementations included:
4347
  <div class="uv-install-logs" id="uv-logs-combine">
4348
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4349
  <div class="uv-logs-content" style="display: none;">
4350
- Installed 37 packages in 206ms
4351
  </div>
4352
  </div>
4353
  <div class="cell-artifacts">
@@ -4360,7 +4347,7 @@ Installed 37 packages in 206ms
4360
  <rdf:RDF>
4361
  <ns2:Work>
4362
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4363
- <dc:date>2025-12-19T19:55:34.708013</dc:date>
4364
  <dc:format>image/svg+xml</dc:format>
4365
  <dc:creator>
4366
  <ns2:Agent>
@@ -4509,96 +4496,83 @@ Installed 37 packages in 206ms
4509
  <g id="matplotlib.axis_2">
4510
  <g id="ytick_1">
4511
  <g id="grid-y--2" class="grid grid-y">
4512
- <path d="M 60.23 448.789427 L 847.294169 448.789427 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4513
  </g>
4514
  <g id="line2d_10">
4515
  <defs>
4516
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4517
  </defs>
4518
  <g>
4519
- <use ns4:href="#m0fca2865ba" x="60.23" y="448.789427" style="stroke: #000000; stroke-width: 0.8" />
4520
  </g>
4521
  </g>
4522
  <g id="text_10">
4523
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="452.588646" transform="rotate(-0 53.23 452.588646)">0.025</text>
4524
  </g>
4525
  </g>
4526
  <g id="ytick_2">
4527
  <g id="grid-y--3" class="grid grid-y">
4528
- <path d="M 60.23 382.723832 L 847.294169 382.723832 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4529
  </g>
4530
  <g id="line2d_11">
4531
  <g>
4532
- <use ns4:href="#m0fca2865ba" x="60.23" y="382.723832" style="stroke: #000000; stroke-width: 0.8" />
4533
  </g>
4534
  </g>
4535
  <g id="text_11">
4536
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="386.523051" transform="rotate(-0 53.23 386.523051)">0.030</text>
4537
  </g>
4538
  </g>
4539
  <g id="ytick_3">
4540
  <g id="grid-y--4" class="grid grid-y">
4541
- <path d="M 60.23 316.658237 L 847.294169 316.658237 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4542
  </g>
4543
  <g id="line2d_12">
4544
  <g>
4545
- <use ns4:href="#m0fca2865ba" x="60.23" y="316.658237" style="stroke: #000000; stroke-width: 0.8" />
4546
  </g>
4547
  </g>
4548
  <g id="text_12">
4549
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="320.457455" transform="rotate(-0 53.23 320.457455)">0.035</text>
4550
  </g>
4551
  </g>
4552
  <g id="ytick_4">
4553
  <g id="grid-y--5" class="grid grid-y">
4554
- <path d="M 60.23 250.592641 L 847.294169 250.592641 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4555
  </g>
4556
  <g id="line2d_13">
4557
  <g>
4558
- <use ns4:href="#m0fca2865ba" x="60.23" y="250.592641" style="stroke: #000000; stroke-width: 0.8" />
4559
  </g>
4560
  </g>
4561
  <g id="text_13">
4562
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="254.39186" transform="rotate(-0 53.23 254.39186)">0.040</text>
4563
  </g>
4564
  </g>
4565
  <g id="ytick_5">
4566
  <g id="grid-y--6" class="grid grid-y">
4567
- <path d="M 60.23 184.527046 L 847.294169 184.527046 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4568
  </g>
4569
  <g id="line2d_14">
4570
  <g>
4571
- <use ns4:href="#m0fca2865ba" x="60.23" y="184.527046" style="stroke: #000000; stroke-width: 0.8" />
4572
  </g>
4573
  </g>
4574
  <g id="text_14">
4575
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="188.326264" transform="rotate(-0 53.23 188.326264)">0.045</text>
4576
  </g>
4577
  </g>
4578
  <g id="ytick_6">
4579
  <g id="grid-y--7" class="grid grid-y">
4580
- <path d="M 60.23 118.46145 L 847.294169 118.46145 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4581
  </g>
4582
  <g id="line2d_15">
4583
  <g>
4584
- <use ns4:href="#m0fca2865ba" x="60.23" y="118.46145" style="stroke: #000000; stroke-width: 0.8" />
4585
  </g>
4586
  </g>
4587
  <g id="text_15">
4588
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="122.260669" transform="rotate(-0 53.23 122.260669)">0.050</text>
4589
- </g>
4590
- </g>
4591
- <g id="ytick_7">
4592
- <g id="grid-y--8" class="grid grid-y">
4593
- <path d="M 60.23 52.395855 L 847.294169 52.395855 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4594
- </g>
4595
- <g id="line2d_16">
4596
- <g>
4597
- <use ns4:href="#m0fca2865ba" x="60.23" y="52.395855" style="stroke: #000000; stroke-width: 0.8" />
4598
- </g>
4599
- </g>
4600
- <g id="text_16">
4601
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="56.195074" transform="rotate(-0 53.23 56.195074)">0.055</text>
4602
  </g>
4603
  </g>
4604
  <g id="label--y" class="ylabel">
@@ -4606,37 +4580,37 @@ Installed 37 packages in 206ms
4606
  </g>
4607
  </g>
4608
  <g id="series--hf-kernels-swiglu" class="series">
4609
- <path d="M 96.005644 451.16779 L 185.444754 375.575536 L 274.883864 378.218159 L 364.322974 389.185046 L 453.762084 390.255309 L 543.201194 395.791605 L 632.640304 397.773576 L 722.079415 392.884719 L 811.518525 395.13095 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4610
  <defs>
4611
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4612
  </defs>
4613
  <g clip-path="url(#p620c7d392f)">
4614
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4615
- <use ns4:href="#md7efaf3aec" x="185.444754" y="375.575536" style="fill: #1f77b4; stroke: #1f77b4" />
4616
- <use ns4:href="#md7efaf3aec" x="274.883864" y="378.218159" style="fill: #1f77b4; stroke: #1f77b4" />
4617
- <use ns4:href="#md7efaf3aec" x="364.322974" y="389.185046" style="fill: #1f77b4; stroke: #1f77b4" />
4618
- <use ns4:href="#md7efaf3aec" x="453.762084" y="390.255309" style="fill: #1f77b4; stroke: #1f77b4" />
4619
- <use ns4:href="#md7efaf3aec" x="543.201194" y="395.791605" style="fill: #1f77b4; stroke: #1f77b4" />
4620
- <use ns4:href="#md7efaf3aec" x="632.640304" y="397.773576" style="fill: #1f77b4; stroke: #1f77b4" />
4621
- <use ns4:href="#md7efaf3aec" x="722.079415" y="392.884719" style="fill: #1f77b4; stroke: #1f77b4" />
4622
- <use ns4:href="#md7efaf3aec" x="811.518525" y="395.13095" style="fill: #1f77b4; stroke: #1f77b4" />
4623
  </g>
4624
  </g>
4625
  <g id="series--torch-eager" class="series">
4626
- <path d="M 96.005644 201.030231 L 185.444754 47.08418 L 274.883864 61.090089 L 364.322974 57.918941 L 453.762084 59.121331 L 543.201194 85.943964 L 632.640304 75.492387 L 722.079415 58.711725 L 811.518525 70.220354 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4627
  <defs>
4628
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4629
  </defs>
4630
  <g clip-path="url(#p620c7d392f)">
4631
- <use ns4:href="#m9b8c54d372" x="96.005644" y="201.030231" style="fill: #ff7f0e; stroke: #ff7f0e" />
4632
- <use ns4:href="#m9b8c54d372" x="185.444754" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4633
- <use ns4:href="#m9b8c54d372" x="274.883864" y="61.090089" style="fill: #ff7f0e; stroke: #ff7f0e" />
4634
- <use ns4:href="#m9b8c54d372" x="364.322974" y="57.918941" style="fill: #ff7f0e; stroke: #ff7f0e" />
4635
- <use ns4:href="#m9b8c54d372" x="453.762084" y="59.121331" style="fill: #ff7f0e; stroke: #ff7f0e" />
4636
- <use ns4:href="#m9b8c54d372" x="543.201194" y="85.943964" style="fill: #ff7f0e; stroke: #ff7f0e" />
4637
- <use ns4:href="#m9b8c54d372" x="632.640304" y="75.492387" style="fill: #ff7f0e; stroke: #ff7f0e" />
4638
- <use ns4:href="#m9b8c54d372" x="722.079415" y="58.711725" style="fill: #ff7f0e; stroke: #ff7f0e" />
4639
- <use ns4:href="#m9b8c54d372" x="811.518525" y="70.220354" style="fill: #ff7f0e; stroke: #ff7f0e" />
4640
  </g>
4641
  </g>
4642
  <g id="patch_3">
@@ -4651,14 +4625,14 @@ Installed 37 packages in 206ms
4651
  <g id="patch_6">
4652
  <path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4653
  </g>
4654
- <g id="text_17">
4655
  <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
4656
  </g>
4657
  <g id="legend" class="legend">
4658
  <g id="patch_7">
4659
  <path d="M 720.811356 466.37197 L 840.294169 466.37197 Q 842.294169 466.37197 842.294169 464.37197 L 842.294169 435.45947 Q 842.294169 433.45947 840.294169 433.45947 L 720.811356 433.45947 Q 718.811356 433.45947 718.811356 435.45947 L 718.811356 464.37197 Q 718.811356 466.37197 720.811356 466.37197 L 720.811356 466.37197 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4660
  </g>
4661
- <g id="line2d_17">
4662
  <path d="M 722.811356 441.557908 L 732.811356 441.557908 L 742.811356 441.557908 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4663
  <g>
4664
  <use ns4:href="#md7efaf3aec" x="732.811356" y="441.557908" style="fill: #1f77b4; stroke: #1f77b4" />
@@ -4667,7 +4641,7 @@ Installed 37 packages in 206ms
4667
  <g id="legend-label--hf-kernels-swiglu" class="legend">
4668
  <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="445.057908" transform="rotate(-0 750.811356 445.057908)">hf_kernels_swiglu</text>
4669
  </g>
4670
- <g id="line2d_18">
4671
  <path d="M 722.811356 456.514158 L 732.811356 456.514158 L 742.811356 456.514158 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4672
  <g>
4673
  <use ns4:href="#m9b8c54d372" x="732.811356" y="456.514158" style="fill: #ff7f0e; stroke: #ff7f0e" />
 
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
+ <dc:date>2025-12-19T23:02:36.234026</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
 
4038
  <g id="matplotlib.axis_2">
4039
  <g id="ytick_1">
4040
  <g id="grid-y--2" class="grid grid-y">
4041
+ <path d="M 60.23 439.989819 L 847.294169 439.989819 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4042
  </g>
4043
  <g id="line2d_10">
4044
  <defs>
4045
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4046
  </defs>
4047
  <g>
4048
+ <use ns4:href="#m0fca2865ba" x="60.23" y="439.989819" style="stroke: #000000; stroke-width: 0.8" />
4049
  </g>
4050
  </g>
4051
  <g id="text_10">
4052
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="443.789038" transform="rotate(-0 53.23 443.789038)">0.025</text>
4053
  </g>
4054
  </g>
4055
  <g id="ytick_2">
4056
  <g id="grid-y--3" class="grid grid-y">
4057
+ <path d="M 60.23 364.462996 L 847.294169 364.462996 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4058
  </g>
4059
  <g id="line2d_11">
4060
  <g>
4061
+ <use ns4:href="#m0fca2865ba" x="60.23" y="364.462996" style="stroke: #000000; stroke-width: 0.8" />
4062
  </g>
4063
  </g>
4064
  <g id="text_11">
4065
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="368.262215" transform="rotate(-0 53.23 368.262215)">0.030</text>
4066
  </g>
4067
  </g>
4068
  <g id="ytick_3">
4069
  <g id="grid-y--4" class="grid grid-y">
4070
+ <path d="M 60.23 288.936174 L 847.294169 288.936174 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4071
  </g>
4072
  <g id="line2d_12">
4073
  <g>
4074
+ <use ns4:href="#m0fca2865ba" x="60.23" y="288.936174" style="stroke: #000000; stroke-width: 0.8" />
4075
  </g>
4076
  </g>
4077
  <g id="text_12">
4078
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="292.735392" transform="rotate(-0 53.23 292.735392)">0.035</text>
4079
  </g>
4080
  </g>
4081
  <g id="ytick_4">
4082
  <g id="grid-y--5" class="grid grid-y">
4083
+ <path d="M 60.23 213.409351 L 847.294169 213.409351 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4084
  </g>
4085
  <g id="line2d_13">
4086
  <g>
4087
+ <use ns4:href="#m0fca2865ba" x="60.23" y="213.409351" style="stroke: #000000; stroke-width: 0.8" />
4088
  </g>
4089
  </g>
4090
  <g id="text_13">
4091
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="217.20857" transform="rotate(-0 53.23 217.20857)">0.040</text>
4092
  </g>
4093
  </g>
4094
  <g id="ytick_5">
4095
  <g id="grid-y--6" class="grid grid-y">
4096
+ <path d="M 60.23 137.882528 L 847.294169 137.882528 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4097
  </g>
4098
  <g id="line2d_14">
4099
  <g>
4100
+ <use ns4:href="#m0fca2865ba" x="60.23" y="137.882528" style="stroke: #000000; stroke-width: 0.8" />
4101
  </g>
4102
  </g>
4103
  <g id="text_14">
4104
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="141.681747" transform="rotate(-0 53.23 141.681747)">0.045</text>
4105
  </g>
4106
  </g>
4107
  <g id="ytick_6">
4108
  <g id="grid-y--7" class="grid grid-y">
4109
+ <path d="M 60.23 62.355705 L 847.294169 62.355705 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4110
  </g>
4111
  <g id="line2d_15">
4112
  <g>
4113
+ <use ns4:href="#m0fca2865ba" x="60.23" y="62.355705" style="stroke: #000000; stroke-width: 0.8" />
4114
  </g>
4115
  </g>
4116
  <g id="text_15">
4117
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="66.154924" transform="rotate(-0 53.23 66.154924)">0.050</text>
 
 
 
 
 
 
 
 
 
 
 
 
 
4118
  </g>
4119
  </g>
4120
  <g id="label--y" class="ylabel">
 
4122
  </g>
4123
  </g>
4124
  <g id="series--hf-kernels-swiglu" class="series">
4125
+ <path d="M 96.005644 451.16779 L 185.444754 381.81906 L 274.883864 399.492335 L 364.322974 403.132727 L 453.762084 404.326052 L 543.201194 418.540201 L 632.640304 397.090583 L 722.079415 413.087163 L 811.518525 398.299013 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4126
  <defs>
4127
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4128
  </defs>
4129
  <g clip-path="url(#p620c7d392f)">
4130
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4131
+ <use ns4:href="#md7efaf3aec" x="185.444754" y="381.81906" style="fill: #1f77b4; stroke: #1f77b4" />
4132
+ <use ns4:href="#md7efaf3aec" x="274.883864" y="399.492335" style="fill: #1f77b4; stroke: #1f77b4" />
4133
+ <use ns4:href="#md7efaf3aec" x="364.322974" y="403.132727" style="fill: #1f77b4; stroke: #1f77b4" />
4134
+ <use ns4:href="#md7efaf3aec" x="453.762084" y="404.326052" style="fill: #1f77b4; stroke: #1f77b4" />
4135
+ <use ns4:href="#md7efaf3aec" x="543.201194" y="418.540201" style="fill: #1f77b4; stroke: #1f77b4" />
4136
+ <use ns4:href="#md7efaf3aec" x="632.640304" y="397.090583" style="fill: #1f77b4; stroke: #1f77b4" />
4137
+ <use ns4:href="#md7efaf3aec" x="722.079415" y="413.087163" style="fill: #1f77b4; stroke: #1f77b4" />
4138
+ <use ns4:href="#md7efaf3aec" x="811.518525" y="398.299013" style="fill: #1f77b4; stroke: #1f77b4" />
4139
  </g>
4140
  </g>
4141
  <g id="series--torch-eager" class="series">
4142
+ <path d="M 96.005644 187.579177 L 185.444754 58.095992 L 274.883864 61.268119 L 364.322974 62.778657 L 453.762084 62.340599 L 543.201194 67.476423 L 632.640304 47.08418 L 722.079415 54.03265 L 811.518525 57.204778 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4143
  <defs>
4144
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4145
  </defs>
4146
  <g clip-path="url(#p620c7d392f)">
4147
+ <use ns4:href="#m9b8c54d372" x="96.005644" y="187.579177" style="fill: #ff7f0e; stroke: #ff7f0e" />
4148
+ <use ns4:href="#m9b8c54d372" x="185.444754" y="58.095992" style="fill: #ff7f0e; stroke: #ff7f0e" />
4149
+ <use ns4:href="#m9b8c54d372" x="274.883864" y="61.268119" style="fill: #ff7f0e; stroke: #ff7f0e" />
4150
+ <use ns4:href="#m9b8c54d372" x="364.322974" y="62.778657" style="fill: #ff7f0e; stroke: #ff7f0e" />
4151
+ <use ns4:href="#m9b8c54d372" x="453.762084" y="62.340599" style="fill: #ff7f0e; stroke: #ff7f0e" />
4152
+ <use ns4:href="#m9b8c54d372" x="543.201194" y="67.476423" style="fill: #ff7f0e; stroke: #ff7f0e" />
4153
+ <use ns4:href="#m9b8c54d372" x="632.640304" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4154
+ <use ns4:href="#m9b8c54d372" x="722.079415" y="54.03265" style="fill: #ff7f0e; stroke: #ff7f0e" />
4155
+ <use ns4:href="#m9b8c54d372" x="811.518525" y="57.204778" style="fill: #ff7f0e; stroke: #ff7f0e" />
4156
  </g>
4157
  </g>
4158
  <g id="patch_3">
 
4167
  <g id="patch_6">
4168
  <path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4169
  </g>
4170
+ <g id="text_16">
4171
  <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
4172
  </g>
4173
  <g id="legend" class="legend">
4174
  <g id="patch_7">
4175
  <path d="M 720.811356 466.37197 L 840.294169 466.37197 Q 842.294169 466.37197 842.294169 464.37197 L 842.294169 435.45947 Q 842.294169 433.45947 840.294169 433.45947 L 720.811356 433.45947 Q 718.811356 433.45947 718.811356 435.45947 L 718.811356 464.37197 Q 718.811356 466.37197 720.811356 466.37197 L 720.811356 466.37197 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4176
  </g>
4177
+ <g id="line2d_16">
4178
  <path d="M 722.811356 441.557908 L 732.811356 441.557908 L 742.811356 441.557908 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4179
  <g>
4180
  <use ns4:href="#md7efaf3aec" x="732.811356" y="441.557908" style="fill: #1f77b4; stroke: #1f77b4" />
 
4183
  <g id="legend-label--hf-kernels-swiglu" class="legend">
4184
  <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="445.057908" transform="rotate(-0 750.811356 445.057908)">hf_kernels_swiglu</text>
4185
  </g>
4186
+ <g id="line2d_17">
4187
  <path d="M 722.811356 456.514158 L 732.811356 456.514158 L 742.811356 456.514158 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4188
  <g>
4189
  <use ns4:href="#m9b8c54d372" x="732.811356" y="456.514158" style="fill: #ff7f0e; stroke: #ff7f0e" />
 
4210
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4211
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4212
  </span> |
4213
+ Cell: combine | 4.66s
4214
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4215
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4216
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4306
  hf_kernels_swiglu cuda_T512_D1024 0.03 True
4307
  hf_kernels_swiglu cuda_T512_D2048 0.03 True
4308
  hf_kernels_swiglu cuda_T512_D768 0.03 True
4309
+ torch_eager cuda_T128_D1024 0.05 True
4310
  torch_eager cuda_T128_D2048 0.05 True
4311
  torch_eager cuda_T128_D768 0.04 True
4312
  torch_eager cuda_T256_D1024 0.05 True
 
4334
  <div class="uv-install-logs" id="uv-logs-combine">
4335
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4336
  <div class="uv-logs-content" style="display: none;">
4337
+ Installed 37 packages in 339ms
4338
  </div>
4339
  </div>
4340
  <div class="cell-artifacts">
 
4347
  <rdf:RDF>
4348
  <ns2:Work>
4349
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4350
+ <dc:date>2025-12-19T23:02:36.234026</dc:date>
4351
  <dc:format>image/svg+xml</dc:format>
4352
  <dc:creator>
4353
  <ns2:Agent>
 
4496
  <g id="matplotlib.axis_2">
4497
  <g id="ytick_1">
4498
  <g id="grid-y--2" class="grid grid-y">
4499
+ <path d="M 60.23 439.989819 L 847.294169 439.989819 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4500
  </g>
4501
  <g id="line2d_10">
4502
  <defs>
4503
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4504
  </defs>
4505
  <g>
4506
+ <use ns4:href="#m0fca2865ba" x="60.23" y="439.989819" style="stroke: #000000; stroke-width: 0.8" />
4507
  </g>
4508
  </g>
4509
  <g id="text_10">
4510
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="443.789038" transform="rotate(-0 53.23 443.789038)">0.025</text>
4511
  </g>
4512
  </g>
4513
  <g id="ytick_2">
4514
  <g id="grid-y--3" class="grid grid-y">
4515
+ <path d="M 60.23 364.462996 L 847.294169 364.462996 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4516
  </g>
4517
  <g id="line2d_11">
4518
  <g>
4519
+ <use ns4:href="#m0fca2865ba" x="60.23" y="364.462996" style="stroke: #000000; stroke-width: 0.8" />
4520
  </g>
4521
  </g>
4522
  <g id="text_11">
4523
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="368.262215" transform="rotate(-0 53.23 368.262215)">0.030</text>
4524
  </g>
4525
  </g>
4526
  <g id="ytick_3">
4527
  <g id="grid-y--4" class="grid grid-y">
4528
+ <path d="M 60.23 288.936174 L 847.294169 288.936174 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4529
  </g>
4530
  <g id="line2d_12">
4531
  <g>
4532
+ <use ns4:href="#m0fca2865ba" x="60.23" y="288.936174" style="stroke: #000000; stroke-width: 0.8" />
4533
  </g>
4534
  </g>
4535
  <g id="text_12">
4536
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="292.735392" transform="rotate(-0 53.23 292.735392)">0.035</text>
4537
  </g>
4538
  </g>
4539
  <g id="ytick_4">
4540
  <g id="grid-y--5" class="grid grid-y">
4541
+ <path d="M 60.23 213.409351 L 847.294169 213.409351 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4542
  </g>
4543
  <g id="line2d_13">
4544
  <g>
4545
+ <use ns4:href="#m0fca2865ba" x="60.23" y="213.409351" style="stroke: #000000; stroke-width: 0.8" />
4546
  </g>
4547
  </g>
4548
  <g id="text_13">
4549
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="217.20857" transform="rotate(-0 53.23 217.20857)">0.040</text>
4550
  </g>
4551
  </g>
4552
  <g id="ytick_5">
4553
  <g id="grid-y--6" class="grid grid-y">
4554
+ <path d="M 60.23 137.882528 L 847.294169 137.882528 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4555
  </g>
4556
  <g id="line2d_14">
4557
  <g>
4558
+ <use ns4:href="#m0fca2865ba" x="60.23" y="137.882528" style="stroke: #000000; stroke-width: 0.8" />
4559
  </g>
4560
  </g>
4561
  <g id="text_14">
4562
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="141.681747" transform="rotate(-0 53.23 141.681747)">0.045</text>
4563
  </g>
4564
  </g>
4565
  <g id="ytick_6">
4566
  <g id="grid-y--7" class="grid grid-y">
4567
+ <path d="M 60.23 62.355705 L 847.294169 62.355705 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4568
  </g>
4569
  <g id="line2d_15">
4570
  <g>
4571
+ <use ns4:href="#m0fca2865ba" x="60.23" y="62.355705" style="stroke: #000000; stroke-width: 0.8" />
4572
  </g>
4573
  </g>
4574
  <g id="text_15">
4575
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="53.23" y="66.154924" transform="rotate(-0 53.23 66.154924)">0.050</text>
 
 
 
 
 
 
 
 
 
 
 
 
 
4576
  </g>
4577
  </g>
4578
  <g id="label--y" class="ylabel">
 
4580
  </g>
4581
  </g>
4582
  <g id="series--hf-kernels-swiglu" class="series">
4583
+ <path d="M 96.005644 451.16779 L 185.444754 381.81906 L 274.883864 399.492335 L 364.322974 403.132727 L 453.762084 404.326052 L 543.201194 418.540201 L 632.640304 397.090583 L 722.079415 413.087163 L 811.518525 398.299013 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4584
  <defs>
4585
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4586
  </defs>
4587
  <g clip-path="url(#p620c7d392f)">
4588
  <use ns4:href="#md7efaf3aec" x="96.005644" y="451.16779" style="fill: #1f77b4; stroke: #1f77b4" />
4589
+ <use ns4:href="#md7efaf3aec" x="185.444754" y="381.81906" style="fill: #1f77b4; stroke: #1f77b4" />
4590
+ <use ns4:href="#md7efaf3aec" x="274.883864" y="399.492335" style="fill: #1f77b4; stroke: #1f77b4" />
4591
+ <use ns4:href="#md7efaf3aec" x="364.322974" y="403.132727" style="fill: #1f77b4; stroke: #1f77b4" />
4592
+ <use ns4:href="#md7efaf3aec" x="453.762084" y="404.326052" style="fill: #1f77b4; stroke: #1f77b4" />
4593
+ <use ns4:href="#md7efaf3aec" x="543.201194" y="418.540201" style="fill: #1f77b4; stroke: #1f77b4" />
4594
+ <use ns4:href="#md7efaf3aec" x="632.640304" y="397.090583" style="fill: #1f77b4; stroke: #1f77b4" />
4595
+ <use ns4:href="#md7efaf3aec" x="722.079415" y="413.087163" style="fill: #1f77b4; stroke: #1f77b4" />
4596
+ <use ns4:href="#md7efaf3aec" x="811.518525" y="398.299013" style="fill: #1f77b4; stroke: #1f77b4" />
4597
  </g>
4598
  </g>
4599
  <g id="series--torch-eager" class="series">
4600
+ <path d="M 96.005644 187.579177 L 185.444754 58.095992 L 274.883864 61.268119 L 364.322974 62.778657 L 453.762084 62.340599 L 543.201194 67.476423 L 632.640304 47.08418 L 722.079415 54.03265 L 811.518525 57.204778 " clip-path="url(#p620c7d392f)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4601
  <defs>
4602
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4603
  </defs>
4604
  <g clip-path="url(#p620c7d392f)">
4605
+ <use ns4:href="#m9b8c54d372" x="96.005644" y="187.579177" style="fill: #ff7f0e; stroke: #ff7f0e" />
4606
+ <use ns4:href="#m9b8c54d372" x="185.444754" y="58.095992" style="fill: #ff7f0e; stroke: #ff7f0e" />
4607
+ <use ns4:href="#m9b8c54d372" x="274.883864" y="61.268119" style="fill: #ff7f0e; stroke: #ff7f0e" />
4608
+ <use ns4:href="#m9b8c54d372" x="364.322974" y="62.778657" style="fill: #ff7f0e; stroke: #ff7f0e" />
4609
+ <use ns4:href="#m9b8c54d372" x="453.762084" y="62.340599" style="fill: #ff7f0e; stroke: #ff7f0e" />
4610
+ <use ns4:href="#m9b8c54d372" x="543.201194" y="67.476423" style="fill: #ff7f0e; stroke: #ff7f0e" />
4611
+ <use ns4:href="#m9b8c54d372" x="632.640304" y="47.08418" style="fill: #ff7f0e; stroke: #ff7f0e" />
4612
+ <use ns4:href="#m9b8c54d372" x="722.079415" y="54.03265" style="fill: #ff7f0e; stroke: #ff7f0e" />
4613
+ <use ns4:href="#m9b8c54d372" x="811.518525" y="57.204778" style="fill: #ff7f0e; stroke: #ff7f0e" />
4614
  </g>
4615
  </g>
4616
  <g id="patch_3">
 
4625
  <g id="patch_6">
4626
  <path d="M 60.23 26.88 L 847.294169 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4627
  </g>
4628
+ <g id="text_16">
4629
  <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="453.762084" y="20.88" transform="rotate(-0 453.762084 20.88)">Attention Implementation Latency</text>
4630
  </g>
4631
  <g id="legend" class="legend">
4632
  <g id="patch_7">
4633
  <path d="M 720.811356 466.37197 L 840.294169 466.37197 Q 842.294169 466.37197 842.294169 464.37197 L 842.294169 435.45947 Q 842.294169 433.45947 840.294169 433.45947 L 720.811356 433.45947 Q 718.811356 433.45947 718.811356 435.45947 L 718.811356 464.37197 Q 718.811356 466.37197 720.811356 466.37197 L 720.811356 466.37197 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4634
  </g>
4635
+ <g id="line2d_16">
4636
  <path d="M 722.811356 441.557908 L 732.811356 441.557908 L 742.811356 441.557908 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4637
  <g>
4638
  <use ns4:href="#md7efaf3aec" x="732.811356" y="441.557908" style="fill: #1f77b4; stroke: #1f77b4" />
 
4641
  <g id="legend-label--hf-kernels-swiglu" class="legend">
4642
  <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="750.811356" y="445.057908" transform="rotate(-0 750.811356 445.057908)">hf_kernels_swiglu</text>
4643
  </g>
4644
+ <g id="line2d_17">
4645
  <path d="M 722.811356 456.514158 L 732.811356 456.514158 L 742.811356 456.514158 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4646
  <g>
4647
  <use ns4:href="#m9b8c54d372" x="732.811356" y="456.514158" style="fill: #ff7f0e; stroke: #ff7f0e" />
causal_conv1d/impls/artifacts/benchmark/causal_conv1d.jsonl CHANGED
@@ -1,24 +1,24 @@
1
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04411100007928326, "p50": 0.04549100003714557, "p90": 0.046580999878642615, "mean": 0.045672999976886786, "iqr": 0.0013399999261309858, "raw_times": [0.04524099995251163, 0.04549100003714557, 0.04694099993685086, 0.046580999878642615, 0.04411100007928326], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05584099994848657, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
2
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05282100005388202, "p50": 0.05301199985296989, "p90": 0.053511999794864096, "mean": 0.053615399929185514, "iqr": 0.0006809998467360856, "raw_times": [0.05282100005388202, 0.05301199985296989, 0.05283099994812801, 0.055900999996083556, 0.053511999794864096], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05840199992235284, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
3
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05104100000608014, "p50": 0.05199099996389123, "p90": 0.052391000053830794, "mean": 0.05195319999984349, "iqr": 0.0011390000054234406, "raw_times": [0.05125200004840735, 0.05309099992700794, 0.05199099996389123, 0.05104100000608014, 0.052391000053830794], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05686200006493891, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
4
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049951999926634016, "p50": 0.052720999974553706, "p90": 0.05275099988466536, "mean": 0.05206719993111619, "iqr": 0.001500000053056283, "raw_times": [0.049951999926634016, 0.052720999974553706, 0.0536610000381188, 0.05275099988466536, 0.05125099983160908], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.056871999959184905, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
5
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04956099996888952, "p50": 0.05162100001143699, "p90": 0.05194100003791391, "mean": 0.05130520003149286, "iqr": 0.0009590000900061568, "raw_times": [0.05194100003791391, 0.052421000191316125, 0.05098199994790775, 0.04956099996888952, 0.05162100001143699], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05496100015989214, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
6
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05030099987379799, "p50": 0.05167099993741431, "p90": 0.05240099994807679, "mean": 0.05315919993336138, "iqr": 0.00083000008999079, "raw_times": [0.05030099987379799, 0.05985200004943181, 0.05240099994807679, 0.051570999858086, 0.05167099993741431], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055181000107040745, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
7
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04961100012224051, "p50": 0.05098099995848315, "p90": 0.05157100008545967, "mean": 0.05098120004731754, "iqr": 0.0007500000265281415, "raw_times": [0.04961100012224051, 0.051922000011472846, 0.05082100005893153, 0.05098099995848315, 0.05157100008545967], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053892000096311676, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
8
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049450999995315215, "p50": 0.049821000175143126, "p90": 0.05057099997429759, "mean": 0.050117000046157045, "iqr": 0.0009599998520570807, "raw_times": [0.049450999995315215, 0.049821000175143126, 0.05113099996378878, 0.05057099997429759, 0.04961100012224051], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0559909999537922, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
9
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04905100013274932, "p50": 0.05060100011178292, "p90": 0.05113099996378878, "mean": 0.051737200055868016, "iqr": 0.0005899998996028444, "raw_times": [0.057362000006833114, 0.05113099996378878, 0.05060100011178292, 0.05054100006418594, 0.04905100013274932], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05664199989041663, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
10
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05021099991608935, "p50": 0.051011000095968484, "p90": 0.05146100011188537, "mean": 0.05368720007936645, "iqr": 0.0007099999947968172, "raw_times": [0.05075100011708855, 0.05021099991608935, 0.06500200015580049, 0.051011000095968484, 0.05146100011188537], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05477099989548151, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
11
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04905100013274932, "p50": 0.0506710000536259, "p90": 0.05074099999546888, "mean": 0.05034520004301157, "iqr": 0.0009190000582748326, "raw_times": [0.04905100013274932, 0.04982199993719405, 0.05074099999546888, 0.0506710000536259, 0.05144100009601971], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05602199985332845, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
12
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04956099996888952, "p50": 0.05081099993731186, "p90": 0.05169099995327997, "mean": 0.05088699999760138, "iqr": 0.001219999830937013, "raw_times": [0.04956099996888952, 0.05081099993731186, 0.05169099995327997, 0.05047100012234296, 0.051901000006182585], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09174200022243895, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
13
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04988099999536644, "p50": 0.05099100008010282, "p90": 0.05105099990032613, "mean": 0.052221000032659504, "iqr": 0.0010199998996540671, "raw_times": [0.050031000000672066, 0.05105099990032613, 0.05915100018683006, 0.04988099999536644, 0.05099100008010282], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05531200008590531, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
14
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05021099991608935, "p50": 0.051081000037811464, "p90": 0.05169199994270457, "mean": 0.0512771999638062, "iqr": 0.0008909998996387003, "raw_times": [0.05169199994270457, 0.05080100004306587, 0.051081000037811464, 0.05021099991608935, 0.05260099987935973], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05523100003301806, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
15
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05031099999541766, "p50": 0.05089100000077451, "p90": 0.05121100002725143, "mean": 0.05099300005895202, "iqr": 0.0007899998308857903, "raw_times": [0.05031099999541766, 0.05121100002725143, 0.05213100007495086, 0.05042100019636564, 0.05089100000077451], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05383200004871469, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
16
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04998199983674567, "p50": 0.05088100010652852, "p90": 0.05478100001710118, "mean": 0.05373740004870342, "iqr": 0.004679999847212457, "raw_times": [0.04998199983674567, 0.05478100001710118, 0.05010100016988872, 0.05088100010652852, 0.06294200011325302], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055170999985421076, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
17
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0490120000904426, "p50": 0.050441999974282226, "p90": 0.0519509999321599, "mean": 0.05078939998384158, "iqr": 0.0017099998785852222, "raw_times": [0.0490120000904426, 0.05230099986874848, 0.050441999974282226, 0.05024100005357468, 0.0519509999321599], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.055800999916755245, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
18
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04950099992129253, "p50": 0.050521000048320275, "p90": 0.05074099999546888, "mean": 0.05043319997639628, "iqr": 0.0008590000106778461, "raw_times": [0.04988199998479104, 0.05152099993210868, 0.050521000048320275, 0.05074099999546888, 0.04950099992129253], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054772000112279784, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
19
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049130999968838296, "p50": 0.05138200003784732, "p90": 0.05160199998499593, "mean": 0.05141159999766387, "iqr": 0.0003800000740739051, "raw_times": [0.049130999968838296, 0.05372100008571579, 0.05122199991092202, 0.05160199998499593, 0.05138200003784732], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05565100013882329, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
20
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05034100013290299, "p50": 0.05126099995322875, "p90": 0.051630999905682984, "mean": 0.051170999995520106, "iqr": 0.0007999999525054591, "raw_times": [0.05126099995322875, 0.05034100013290299, 0.051630999905682984, 0.05179100003260828, 0.050830999953177525], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05449200011753419, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
21
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05069099984211789, "p50": 0.05091200000606477, "p90": 0.05127099984747474, "mean": 0.07049399991956307, "iqr": 0.00038899997889529914, "raw_times": [0.05069099984211789, 0.1487140000335785, 0.05091200000606477, 0.05088199986857944, 0.05127099984747474], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054521000038221246, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
22
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049631000138106174, "p50": 0.05295099981594831, "p90": 0.05334100001164188, "mean": 0.057009199963431456, "iqr": 0.0027099999897473026, "raw_times": [0.049631000138106174, 0.05295099981594831, 0.05334100001164188, 0.05063100002189458, 0.07849199982956634], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05502199996954005, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
23
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0506710000536259, "p50": 0.05091100001664017, "p90": 0.051161999863325036, "mean": 0.0510071999997308, "iqr": 0.00031099989428184927, "raw_times": [0.05091100001664017, 0.0506710000536259, 0.05085099996904319, 0.05144100009601971, 0.051161999863325036], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.057221000133722555, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
24
- {"ts": "2025-12-19T19:55:08Z", "run": "459614c9d5414527a9837368a3b932b3", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04936100003760657, "p50": 0.050112000053559314, "p90": 0.05135099991093739, "mean": 0.050513199994384195, "iqr": 0.0015199998415482696, "raw_times": [0.04936100003760657, 0.05135099991093739, 0.04983100006938912, 0.05191099990042858, 0.050112000053559314], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05625199992209673, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
 
1
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W2", "batch": 2, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04374100012682902, "p50": 0.046281000095405034, "p90": 0.04699100009020185, "mean": 0.04588520009747299, "iqr": 0.0018789999103319133, "raw_times": [0.0473009999950591, 0.04699100009020185, 0.04511200017986994, 0.046281000095405034, 0.04374100012682902], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054351000017049955, "peak_bytes": 295936, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
2
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S128_W4", "batch": 2, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.05076099978396087, "p50": 0.05160099999557133, "p90": 0.0517210000907653, "mean": 0.05158899998605193, "iqr": 0.00034000004234258085, "raw_times": [0.05076099978396087, 0.052481000011539436, 0.05138100004842272, 0.0517210000907653, 0.05160099999557133], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.06265199999688775, "peak_bytes": 296448, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
3
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W2", "batch": 2, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04984099996363511, "p50": 0.05232199987403874, "p90": 0.05389100010688708, "mean": 0.05356520000532328, "iqr": 0.0016999999843392288, "raw_times": [0.04984099996363511, 0.05389100010688708, 0.05232199987403874, 0.05958099995950761, 0.05219100012254785], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05472199995892879, "peak_bytes": 1180672, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
4
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S512_W4", "batch": 2, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.050031999990096665, "p50": 0.0509510000483715, "p90": 0.0512909998633404, "mean": 0.05097519997434574, "iqr": 0.0009099996987060877, "raw_times": [0.050381000164634315, 0.0512909998633404, 0.05222099980528583, 0.0509510000483715, 0.050031999990096665], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05585200005953084, "peak_bytes": 1181184, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
5
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W2", "batch": 2, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048549999974056846, "p50": 0.04953099983140419, "p90": 0.04999000020688982, "mean": 0.04990460001863539, "iqr": 0.0006790000952605624, "raw_times": [0.04953099983140419, 0.052140999969196855, 0.04999000020688982, 0.049311000111629255, 0.048549999974056846], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054900999884921475, "peak_bytes": 4719616, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
6
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D64_S2048_W4", "batch": 2, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04891100002168969, "p50": 0.04958099998475518, "p90": 0.05115099997965444, "mean": 0.05000500000278407, "iqr": 0.002170000016121776, "raw_times": [0.04891100002168969, 0.05140100006428838, 0.04958099998475518, 0.05115099997965444, 0.04898099996353267], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05376100011744711, "peak_bytes": 4720128, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
7
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W2", "batch": 2, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04850100003750413, "p50": 0.04982199993719405, "p90": 0.04984099996363511, "mean": 0.04968119997101894, "iqr": 0.0003499999365885742, "raw_times": [0.04850100003750413, 0.04984099996363511, 0.04949100002704654, 0.050750999889714876, 0.04982199993719405], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054501000022355583, "peak_bytes": 9461760, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
8
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S128_W4", "batch": 2, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049300999990009586, "p50": 0.04983100006938912, "p90": 0.050170999884358025, "mean": 0.05779919997621619, "iqr": 0.0004900000476482091, "raw_times": [0.09001200010061439, 0.04983100006938912, 0.049680999836709816, 0.050170999884358025, 0.049300999990009586], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.052752000101463636, "peak_bytes": 9478144, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
9
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W2", "batch": 2, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047761000132595655, "p50": 0.04934100002174091, "p90": 0.049350999915986904, "mean": 0.0488690000565839, "iqr": 0.0008099998467514524, "raw_times": [0.04854100006923545, 0.04935100014336058, 0.049350999915986904, 0.04934100002174091, 0.047761000132595655], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.052481000011539436, "peak_bytes": 37773312, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
10
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S512_W4", "batch": 2, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048450999884153134, "p50": 0.04965099992659816, "p90": 0.050621000127648585, "mean": 0.05171900002096663, "iqr": 0.0015900000107649248, "raw_times": [0.048450999884153134, 0.04903100011688366, 0.04965099992659816, 0.050621000127648585, 0.06084100004954962], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053670999932364793, "peak_bytes": 37789696, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
11
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W2", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047580999989804695, "p50": 0.0487410000005184, "p90": 0.050201000021843356, "mean": 0.04913100001431303, "iqr": 0.002320000021427404, "raw_times": [0.047580999989804695, 0.050201000021843356, 0.051251000058982754, 0.04788100000041595, 0.0487410000005184], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054761000001235516, "peak_bytes": 151019520, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
12
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_D2048_S2048_W4", "batch": 2, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049270999852524255, "p50": 0.049540999953023857, "p90": 0.04967099994246382, "mean": 0.049752999939300935, "iqr": 0.0003000000106112566, "raw_times": [0.049370999931852566, 0.049540999953023857, 0.049270999852524255, 0.05091100001664017, 0.04967099994246382], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053771000011693104, "peak_bytes": 151035904, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
13
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W2", "batch": 4, "dim": 64, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04857099997934711, "p50": 0.04948099990542687, "p90": 0.049690999958329485, "mean": 0.04942899995512562, "iqr": 0.0004900000476482091, "raw_times": [0.04857099997934711, 0.04948099990542687, 0.049690999958329485, 0.049200999910681276, 0.050201000021843356], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05436199990072055, "peak_bytes": 33727488, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
14
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S128_W4", "batch": 4, "dim": 64, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048160999995161546, "p50": 0.049621000016486505, "p90": 0.05022099981033534, "mean": 0.04960719993505336, "iqr": 0.0007689998255955288, "raw_times": [0.048160999995161546, 0.049621000016486505, 0.05022099981033534, 0.049451999984739814, 0.050580999868543586], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053051000122650294, "peak_bytes": 591360, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
15
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W2", "batch": 4, "dim": 64, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04980199992132839, "p50": 0.05055099995843193, "p90": 0.050551000185805606, "mean": 0.05498319997059298, "iqr": 0.00047000025915622246, "raw_times": [0.07393099986074958, 0.05008099992664938, 0.05055099995843193, 0.050551000185805606, 0.04980199992132839], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053621999995812075, "peak_bytes": 2360320, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
16
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S512_W4", "batch": 4, "dim": 64, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.048960999947667005, "p50": 0.05131100010657974, "p90": 0.052661000154330395, "mean": 0.051906999988204916, "iqr": 0.0029200002700235927, "raw_times": [0.048960999947667005, 0.052661000154330395, 0.056860999848140636, 0.0497409998843068, 0.05131100010657974], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.053932000128043, "peak_bytes": 2360832, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
17
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W2", "batch": 4, "dim": 64, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04804100012734125, "p50": 0.04966200003764243, "p90": 0.05004099989491806, "mean": 0.04977120001967705, "iqr": 0.0007399999049084727, "raw_times": [0.04804100012734125, 0.05181100004847394, 0.04966200003764243, 0.05004099989491806, 0.049300999990009586], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05391199988480366, "peak_bytes": 9438208, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
18
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D64_S2048_W4", "batch": 4, "dim": 64, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.047730999995110324, "p50": 0.04978099991603813, "p90": 0.049860999979500775, "mean": 0.04935500001010951, "iqr": 0.0009599998520570807, "raw_times": [0.047730999995110324, 0.04978099991603813, 0.05050100003245461, 0.048901000127443695, 0.049860999979500775], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0537809999059391, "peak_bytes": 9438720, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
19
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W2", "batch": 4, "dim": 2048, "seqlen": 128, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.049270999852524255, "p50": 0.05024100005357468, "p90": 0.05084099984742352, "mean": 0.050164999993285164, "iqr": 0.0012699997569143306, "raw_times": [0.049270999852524255, 0.05024100005357468, 0.05084099984742352, 0.05090100012239418, 0.04957100009050919], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05270199994811264, "peak_bytes": 18931712, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
20
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S128_W4", "batch": 4, "dim": 2048, "seqlen": 128, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04698099996858218, "p50": 0.049690999958329485, "p90": 0.04984100019100879, "mean": 0.049111000043922104, "iqr": 0.00083000008999079, "raw_times": [0.04698099996858218, 0.04984100019100879, 0.050031000000672066, 0.049011000101018, 0.049690999958329485], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05244099997980811, "peak_bytes": 18948096, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
21
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W2", "batch": 4, "dim": 2048, "seqlen": 512, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04767199993693794, "p50": 0.049621000016486505, "p90": 0.04999099996894074, "mean": 0.049529399984749034, "iqr": 0.0007789999472151976, "raw_times": [0.04767199993693794, 0.049621000016486505, 0.05115099997965444, 0.04999099996894074, 0.049212000021725544], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05385099984778208, "peak_bytes": 75522048, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
22
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S512_W4", "batch": 4, "dim": 2048, "seqlen": 512, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04878100003224972, "p50": 0.049651000153971836, "p90": 0.050100999942515045, "mean": 0.04970100003447442, "iqr": 0.0005699998837371822, "raw_times": [0.04878100003224972, 0.050440999984857626, 0.049651000153971836, 0.04953100005877786, 0.050100999942515045], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.054691000059392536, "peak_bytes": 75538432, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
23
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W2", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 2, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04801099998985592, "p50": 0.049041000011129654, "p90": 0.04916099987894995, "mean": 0.04916500001854729, "iqr": 0.00038999974094622303, "raw_times": [0.04801099998985592, 0.050841000074797194, 0.04916099987894995, 0.04877100013800373, 0.049041000011129654], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05514100007530942, "peak_bytes": 302014464, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
24
+ {"ts": "2025-12-19T23:00:51Z", "run": "741fa3d3d3d540d1b123377179e6fd21", "impl": "hf_kernels_causal_conv1d", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B4_D2048_S2048_W4", "batch": 4, "dim": 2048, "seqlen": 2048, "width": 4, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04843099986828747, "p50": 0.04957099986313551, "p90": 0.04984099996363511, "mean": 0.04942899995512562, "iqr": 0.00046999980440887157, "raw_times": [0.04843099986828747, 0.04937100015922624, 0.049930999921343755, 0.04957099986313551, 0.04984099996363511], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.05356199994821509, "peak_bytes": 302030848, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 0.003, "atol": 0.005, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "causal_conv1d_fp32"}, "err": null}
causal_conv1d/impls/hf_kernels_causal_conv1d.html CHANGED
The diff for this file is too large to render. See raw diff
 
causal_conv1d/impls/torch_causal_conv1d.html CHANGED
The diff for this file is too large to render. See raw diff
 
causal_conv1d/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: a2617076455d3985f32d3652d376c64caac8acd7513e105352d8ccd515d5c005
  • Pointer size: 130 Bytes
  • Size of remote file: 35.4 kB

Git LFS Details

  • SHA256: 2ee8e4503bfdd426f73797bc1dc8282f57f594087b5fe7c44c74d67c14a07ba6
  • Pointer size: 130 Bytes
  • Size of remote file: 35.4 kB
causal_conv1d/results/combined_results.html CHANGED
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
- <dc:date>2025-12-19T19:55:43.820965</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
@@ -4233,70 +4233,70 @@ body[data-tool="eraser"] .main-content {
4233
  <g id="matplotlib.axis_2">
4234
  <g id="ytick_1">
4235
  <g id="grid-y--2" class="grid grid-y">
4236
- <path d="M 47.72 375.22161 L 831.034248 375.22161 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4237
  </g>
4238
  <g id="line2d_25">
4239
  <defs>
4240
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4241
  </defs>
4242
  <g>
4243
- <use ns4:href="#m0fca2865ba" x="47.72" y="375.22161" style="stroke: #000000; stroke-width: 0.8" />
4244
  </g>
4245
  </g>
4246
  <g id="text_25">
4247
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="379.020828" transform="rotate(-0 40.72 379.020828)">0.1</text>
4248
  </g>
4249
  </g>
4250
  <g id="ytick_2">
4251
  <g id="grid-y--3" class="grid grid-y">
4252
- <path d="M 47.72 292.730166 L 831.034248 292.730166 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4253
  </g>
4254
  <g id="line2d_26">
4255
  <g>
4256
- <use ns4:href="#m0fca2865ba" x="47.72" y="292.730166" style="stroke: #000000; stroke-width: 0.8" />
4257
  </g>
4258
  </g>
4259
  <g id="text_26">
4260
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.529385" transform="rotate(-0 40.72 296.529385)">0.2</text>
4261
  </g>
4262
  </g>
4263
  <g id="ytick_3">
4264
  <g id="grid-y--4" class="grid grid-y">
4265
- <path d="M 47.72 210.238722 L 831.034248 210.238722 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4266
  </g>
4267
  <g id="line2d_27">
4268
  <g>
4269
- <use ns4:href="#m0fca2865ba" x="47.72" y="210.238722" style="stroke: #000000; stroke-width: 0.8" />
4270
  </g>
4271
  </g>
4272
  <g id="text_27">
4273
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="214.037941" transform="rotate(-0 40.72 214.037941)">0.3</text>
4274
  </g>
4275
  </g>
4276
  <g id="ytick_4">
4277
  <g id="grid-y--5" class="grid grid-y">
4278
- <path d="M 47.72 127.747279 L 831.034248 127.747279 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4279
  </g>
4280
  <g id="line2d_28">
4281
  <g>
4282
- <use ns4:href="#m0fca2865ba" x="47.72" y="127.747279" style="stroke: #000000; stroke-width: 0.8" />
4283
  </g>
4284
  </g>
4285
  <g id="text_28">
4286
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="131.546498" transform="rotate(-0 40.72 131.546498)">0.4</text>
4287
  </g>
4288
  </g>
4289
  <g id="ytick_5">
4290
  <g id="grid-y--6" class="grid grid-y">
4291
- <path d="M 47.72 45.255835 L 831.034248 45.255835 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4292
  </g>
4293
  <g id="line2d_29">
4294
  <g>
4295
- <use ns4:href="#m0fca2865ba" x="47.72" y="45.255835" style="stroke: #000000; stroke-width: 0.8" />
4296
  </g>
4297
  </g>
4298
  <g id="text_29">
4299
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="49.055054" transform="rotate(-0 40.72 49.055054)">0.5</text>
4300
  </g>
4301
  </g>
4302
  <g id="label--y" class="ylabel">
@@ -4304,66 +4304,66 @@ body[data-tool="eraser"] .main-content {
4304
  </g>
4305
  </g>
4306
  <g id="series--hf-kernels-causal-conv1d" class="series">
4307
- <path d="M 83.325193 420.186871 L 114.286231 413.982689 L 145.247268 414.824927 L 176.208306 414.222739 L 207.169343 415.130145 L 238.130381 415.088899 L 269.091418 415.65809 L 300.052455 416.614991 L 331.013493 415.971558 L 361.97453 415.633343 L 392.935568 415.913814 L 423.896605 415.798326 L 454.857643 415.649841 L 485.81868 415.575599 L 516.779718 415.732333 L 547.740755 415.740582 L 578.701793 416.102719 L 609.66283 416.037551 L 640.623868 415.3273 L 671.584905 415.427114 L 702.545943 415.715009 L 733.50698 414.033009 L 764.468018 415.715834 L 795.429055 416.374941 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4308
  <defs>
4309
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4310
  </defs>
4311
  <g clip-path="url(#pb49fc4c8d2)">
4312
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
4313
- <use ns4:href="#md7efaf3aec" x="114.286231" y="413.982689" style="fill: #1f77b4; stroke: #1f77b4" />
4314
- <use ns4:href="#md7efaf3aec" x="145.247268" y="414.824927" style="fill: #1f77b4; stroke: #1f77b4" />
4315
- <use ns4:href="#md7efaf3aec" x="176.208306" y="414.222739" style="fill: #1f77b4; stroke: #1f77b4" />
4316
- <use ns4:href="#md7efaf3aec" x="207.169343" y="415.130145" style="fill: #1f77b4; stroke: #1f77b4" />
4317
- <use ns4:href="#md7efaf3aec" x="238.130381" y="415.088899" style="fill: #1f77b4; stroke: #1f77b4" />
4318
- <use ns4:href="#md7efaf3aec" x="269.091418" y="415.65809" style="fill: #1f77b4; stroke: #1f77b4" />
4319
- <use ns4:href="#md7efaf3aec" x="300.052455" y="416.614991" style="fill: #1f77b4; stroke: #1f77b4" />
4320
- <use ns4:href="#md7efaf3aec" x="331.013493" y="415.971558" style="fill: #1f77b4; stroke: #1f77b4" />
4321
- <use ns4:href="#md7efaf3aec" x="361.97453" y="415.633343" style="fill: #1f77b4; stroke: #1f77b4" />
4322
- <use ns4:href="#md7efaf3aec" x="392.935568" y="415.913814" style="fill: #1f77b4; stroke: #1f77b4" />
4323
- <use ns4:href="#md7efaf3aec" x="423.896605" y="415.798326" style="fill: #1f77b4; stroke: #1f77b4" />
4324
- <use ns4:href="#md7efaf3aec" x="454.857643" y="415.649841" style="fill: #1f77b4; stroke: #1f77b4" />
4325
- <use ns4:href="#md7efaf3aec" x="485.81868" y="415.575599" style="fill: #1f77b4; stroke: #1f77b4" />
4326
- <use ns4:href="#md7efaf3aec" x="516.779718" y="415.732333" style="fill: #1f77b4; stroke: #1f77b4" />
4327
- <use ns4:href="#md7efaf3aec" x="547.740755" y="415.740582" style="fill: #1f77b4; stroke: #1f77b4" />
4328
- <use ns4:href="#md7efaf3aec" x="578.701793" y="416.102719" style="fill: #1f77b4; stroke: #1f77b4" />
4329
- <use ns4:href="#md7efaf3aec" x="609.66283" y="416.037551" style="fill: #1f77b4; stroke: #1f77b4" />
4330
- <use ns4:href="#md7efaf3aec" x="640.623868" y="415.3273" style="fill: #1f77b4; stroke: #1f77b4" />
4331
- <use ns4:href="#md7efaf3aec" x="671.584905" y="415.427114" style="fill: #1f77b4; stroke: #1f77b4" />
4332
- <use ns4:href="#md7efaf3aec" x="702.545943" y="415.715009" style="fill: #1f77b4; stroke: #1f77b4" />
4333
- <use ns4:href="#md7efaf3aec" x="733.50698" y="414.033009" style="fill: #1f77b4; stroke: #1f77b4" />
4334
- <use ns4:href="#md7efaf3aec" x="764.468018" y="415.715834" style="fill: #1f77b4; stroke: #1f77b4" />
4335
- <use ns4:href="#md7efaf3aec" x="795.429055" y="416.374941" style="fill: #1f77b4; stroke: #1f77b4" />
4336
  </g>
4337
  </g>
4338
  <g id="series--torch-eager" class="series">
4339
- <path d="M 83.325193 398.136083 L 114.286231 388.072127 L 145.247268 387.931066 L 176.208306 388.641318 L 207.169343 388.731233 L 238.130381 388.599247 L 269.091418 390.224329 L 300.052455 389.556973 L 331.013493 389.416737 L 361.97453 388.913539 L 392.935568 326.565682 L 423.896605 322.943482 L 454.857643 389.985928 L 485.81868 389.721956 L 516.779718 389.763201 L 547.740755 389.433236 L 578.701793 390.282897 L 609.66283 388.765055 L 640.623868 388.979533 L 671.584905 389.457983 L 702.545943 380.730388 L 733.50698 375.598596 L 764.468018 57.586656 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4340
  <defs>
4341
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4342
  </defs>
4343
  <g clip-path="url(#pb49fc4c8d2)">
4344
- <use ns4:href="#m9b8c54d372" x="83.325193" y="398.136083" style="fill: #ff7f0e; stroke: #ff7f0e" />
4345
- <use ns4:href="#m9b8c54d372" x="114.286231" y="388.072127" style="fill: #ff7f0e; stroke: #ff7f0e" />
4346
- <use ns4:href="#m9b8c54d372" x="145.247268" y="387.931066" style="fill: #ff7f0e; stroke: #ff7f0e" />
4347
- <use ns4:href="#m9b8c54d372" x="176.208306" y="388.641318" style="fill: #ff7f0e; stroke: #ff7f0e" />
4348
- <use ns4:href="#m9b8c54d372" x="207.169343" y="388.731233" style="fill: #ff7f0e; stroke: #ff7f0e" />
4349
- <use ns4:href="#m9b8c54d372" x="238.130381" y="388.599247" style="fill: #ff7f0e; stroke: #ff7f0e" />
4350
- <use ns4:href="#m9b8c54d372" x="269.091418" y="390.224329" style="fill: #ff7f0e; stroke: #ff7f0e" />
4351
- <use ns4:href="#m9b8c54d372" x="300.052455" y="389.556973" style="fill: #ff7f0e; stroke: #ff7f0e" />
4352
- <use ns4:href="#m9b8c54d372" x="331.013493" y="389.416737" style="fill: #ff7f0e; stroke: #ff7f0e" />
4353
- <use ns4:href="#m9b8c54d372" x="361.97453" y="388.913539" style="fill: #ff7f0e; stroke: #ff7f0e" />
4354
- <use ns4:href="#m9b8c54d372" x="392.935568" y="326.565682" style="fill: #ff7f0e; stroke: #ff7f0e" />
4355
- <use ns4:href="#m9b8c54d372" x="423.896605" y="322.943482" style="fill: #ff7f0e; stroke: #ff7f0e" />
4356
- <use ns4:href="#m9b8c54d372" x="454.857643" y="389.985928" style="fill: #ff7f0e; stroke: #ff7f0e" />
4357
- <use ns4:href="#m9b8c54d372" x="485.81868" y="389.721956" style="fill: #ff7f0e; stroke: #ff7f0e" />
4358
- <use ns4:href="#m9b8c54d372" x="516.779718" y="389.763201" style="fill: #ff7f0e; stroke: #ff7f0e" />
4359
- <use ns4:href="#m9b8c54d372" x="547.740755" y="389.433236" style="fill: #ff7f0e; stroke: #ff7f0e" />
4360
- <use ns4:href="#m9b8c54d372" x="578.701793" y="390.282897" style="fill: #ff7f0e; stroke: #ff7f0e" />
4361
- <use ns4:href="#m9b8c54d372" x="609.66283" y="388.765055" style="fill: #ff7f0e; stroke: #ff7f0e" />
4362
- <use ns4:href="#m9b8c54d372" x="640.623868" y="388.979533" style="fill: #ff7f0e; stroke: #ff7f0e" />
4363
- <use ns4:href="#m9b8c54d372" x="671.584905" y="389.457983" style="fill: #ff7f0e; stroke: #ff7f0e" />
4364
- <use ns4:href="#m9b8c54d372" x="702.545943" y="380.730388" style="fill: #ff7f0e; stroke: #ff7f0e" />
4365
- <use ns4:href="#m9b8c54d372" x="733.50698" y="375.598596" style="fill: #ff7f0e; stroke: #ff7f0e" />
4366
- <use ns4:href="#m9b8c54d372" x="764.468018" y="57.586656" style="fill: #ff7f0e; stroke: #ff7f0e" />
4367
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
4368
  </g>
4369
  </g>
@@ -4422,7 +4422,7 @@ body[data-tool="eraser"] .main-content {
4422
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4423
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4424
  </span> |
4425
- Cell: combine | 4.64s
4426
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4427
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4428
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4547,7 +4547,7 @@ torch_eager cuda_B2_D64_S512_W2 0.08 True
4547
  torch_eager cuda_B2_D64_S512_W4 0.08 True
4548
  torch_eager cuda_B4_D2048_S128_W2 0.08 True
4549
  torch_eager cuda_B4_D2048_S128_W4 0.08 True
4550
- torch_eager cuda_B4_D2048_S2048_W2 0.49 True
4551
  torch_eager cuda_B4_D2048_S2048_W4 0.50 True
4552
  torch_eager cuda_B4_D2048_S512_W2 0.09 True
4553
  torch_eager cuda_B4_D2048_S512_W4 0.10 True
@@ -4576,7 +4576,7 @@ Implementations included:
4576
  <div class="uv-install-logs" id="uv-logs-combine">
4577
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4578
  <div class="uv-logs-content" style="display: none;">
4579
- Installed 37 packages in 204ms
4580
  </div>
4581
  </div>
4582
  <div class="cell-artifacts">
@@ -4589,7 +4589,7 @@ Installed 37 packages in 204ms
4589
  <rdf:RDF>
4590
  <ns2:Work>
4591
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4592
- <dc:date>2025-12-19T19:55:43.820965</dc:date>
4593
  <dc:format>image/svg+xml</dc:format>
4594
  <dc:creator>
4595
  <ns2:Agent>
@@ -4933,70 +4933,70 @@ Installed 37 packages in 204ms
4933
  <g id="matplotlib.axis_2">
4934
  <g id="ytick_1">
4935
  <g id="grid-y--2" class="grid grid-y">
4936
- <path d="M 47.72 375.22161 L 831.034248 375.22161 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4937
  </g>
4938
  <g id="line2d_25">
4939
  <defs>
4940
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4941
  </defs>
4942
  <g>
4943
- <use ns4:href="#m0fca2865ba" x="47.72" y="375.22161" style="stroke: #000000; stroke-width: 0.8" />
4944
  </g>
4945
  </g>
4946
  <g id="text_25">
4947
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="379.020828" transform="rotate(-0 40.72 379.020828)">0.1</text>
4948
  </g>
4949
  </g>
4950
  <g id="ytick_2">
4951
  <g id="grid-y--3" class="grid grid-y">
4952
- <path d="M 47.72 292.730166 L 831.034248 292.730166 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4953
  </g>
4954
  <g id="line2d_26">
4955
  <g>
4956
- <use ns4:href="#m0fca2865ba" x="47.72" y="292.730166" style="stroke: #000000; stroke-width: 0.8" />
4957
  </g>
4958
  </g>
4959
  <g id="text_26">
4960
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.529385" transform="rotate(-0 40.72 296.529385)">0.2</text>
4961
  </g>
4962
  </g>
4963
  <g id="ytick_3">
4964
  <g id="grid-y--4" class="grid grid-y">
4965
- <path d="M 47.72 210.238722 L 831.034248 210.238722 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4966
  </g>
4967
  <g id="line2d_27">
4968
  <g>
4969
- <use ns4:href="#m0fca2865ba" x="47.72" y="210.238722" style="stroke: #000000; stroke-width: 0.8" />
4970
  </g>
4971
  </g>
4972
  <g id="text_27">
4973
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="214.037941" transform="rotate(-0 40.72 214.037941)">0.3</text>
4974
  </g>
4975
  </g>
4976
  <g id="ytick_4">
4977
  <g id="grid-y--5" class="grid grid-y">
4978
- <path d="M 47.72 127.747279 L 831.034248 127.747279 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4979
  </g>
4980
  <g id="line2d_28">
4981
  <g>
4982
- <use ns4:href="#m0fca2865ba" x="47.72" y="127.747279" style="stroke: #000000; stroke-width: 0.8" />
4983
  </g>
4984
  </g>
4985
  <g id="text_28">
4986
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="131.546498" transform="rotate(-0 40.72 131.546498)">0.4</text>
4987
  </g>
4988
  </g>
4989
  <g id="ytick_5">
4990
  <g id="grid-y--6" class="grid grid-y">
4991
- <path d="M 47.72 45.255835 L 831.034248 45.255835 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4992
  </g>
4993
  <g id="line2d_29">
4994
  <g>
4995
- <use ns4:href="#m0fca2865ba" x="47.72" y="45.255835" style="stroke: #000000; stroke-width: 0.8" />
4996
  </g>
4997
  </g>
4998
  <g id="text_29">
4999
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="49.055054" transform="rotate(-0 40.72 49.055054)">0.5</text>
5000
  </g>
5001
  </g>
5002
  <g id="label--y" class="ylabel">
@@ -5004,66 +5004,66 @@ Installed 37 packages in 204ms
5004
  </g>
5005
  </g>
5006
  <g id="series--hf-kernels-causal-conv1d" class="series">
5007
- <path d="M 83.325193 420.186871 L 114.286231 413.982689 L 145.247268 414.824927 L 176.208306 414.222739 L 207.169343 415.130145 L 238.130381 415.088899 L 269.091418 415.65809 L 300.052455 416.614991 L 331.013493 415.971558 L 361.97453 415.633343 L 392.935568 415.913814 L 423.896605 415.798326 L 454.857643 415.649841 L 485.81868 415.575599 L 516.779718 415.732333 L 547.740755 415.740582 L 578.701793 416.102719 L 609.66283 416.037551 L 640.623868 415.3273 L 671.584905 415.427114 L 702.545943 415.715009 L 733.50698 414.033009 L 764.468018 415.715834 L 795.429055 416.374941 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
5008
  <defs>
5009
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
5010
  </defs>
5011
  <g clip-path="url(#pb49fc4c8d2)">
5012
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
5013
- <use ns4:href="#md7efaf3aec" x="114.286231" y="413.982689" style="fill: #1f77b4; stroke: #1f77b4" />
5014
- <use ns4:href="#md7efaf3aec" x="145.247268" y="414.824927" style="fill: #1f77b4; stroke: #1f77b4" />
5015
- <use ns4:href="#md7efaf3aec" x="176.208306" y="414.222739" style="fill: #1f77b4; stroke: #1f77b4" />
5016
- <use ns4:href="#md7efaf3aec" x="207.169343" y="415.130145" style="fill: #1f77b4; stroke: #1f77b4" />
5017
- <use ns4:href="#md7efaf3aec" x="238.130381" y="415.088899" style="fill: #1f77b4; stroke: #1f77b4" />
5018
- <use ns4:href="#md7efaf3aec" x="269.091418" y="415.65809" style="fill: #1f77b4; stroke: #1f77b4" />
5019
- <use ns4:href="#md7efaf3aec" x="300.052455" y="416.614991" style="fill: #1f77b4; stroke: #1f77b4" />
5020
- <use ns4:href="#md7efaf3aec" x="331.013493" y="415.971558" style="fill: #1f77b4; stroke: #1f77b4" />
5021
- <use ns4:href="#md7efaf3aec" x="361.97453" y="415.633343" style="fill: #1f77b4; stroke: #1f77b4" />
5022
- <use ns4:href="#md7efaf3aec" x="392.935568" y="415.913814" style="fill: #1f77b4; stroke: #1f77b4" />
5023
- <use ns4:href="#md7efaf3aec" x="423.896605" y="415.798326" style="fill: #1f77b4; stroke: #1f77b4" />
5024
- <use ns4:href="#md7efaf3aec" x="454.857643" y="415.649841" style="fill: #1f77b4; stroke: #1f77b4" />
5025
- <use ns4:href="#md7efaf3aec" x="485.81868" y="415.575599" style="fill: #1f77b4; stroke: #1f77b4" />
5026
- <use ns4:href="#md7efaf3aec" x="516.779718" y="415.732333" style="fill: #1f77b4; stroke: #1f77b4" />
5027
- <use ns4:href="#md7efaf3aec" x="547.740755" y="415.740582" style="fill: #1f77b4; stroke: #1f77b4" />
5028
- <use ns4:href="#md7efaf3aec" x="578.701793" y="416.102719" style="fill: #1f77b4; stroke: #1f77b4" />
5029
- <use ns4:href="#md7efaf3aec" x="609.66283" y="416.037551" style="fill: #1f77b4; stroke: #1f77b4" />
5030
- <use ns4:href="#md7efaf3aec" x="640.623868" y="415.3273" style="fill: #1f77b4; stroke: #1f77b4" />
5031
- <use ns4:href="#md7efaf3aec" x="671.584905" y="415.427114" style="fill: #1f77b4; stroke: #1f77b4" />
5032
- <use ns4:href="#md7efaf3aec" x="702.545943" y="415.715009" style="fill: #1f77b4; stroke: #1f77b4" />
5033
- <use ns4:href="#md7efaf3aec" x="733.50698" y="414.033009" style="fill: #1f77b4; stroke: #1f77b4" />
5034
- <use ns4:href="#md7efaf3aec" x="764.468018" y="415.715834" style="fill: #1f77b4; stroke: #1f77b4" />
5035
- <use ns4:href="#md7efaf3aec" x="795.429055" y="416.374941" style="fill: #1f77b4; stroke: #1f77b4" />
5036
  </g>
5037
  </g>
5038
  <g id="series--torch-eager" class="series">
5039
- <path d="M 83.325193 398.136083 L 114.286231 388.072127 L 145.247268 387.931066 L 176.208306 388.641318 L 207.169343 388.731233 L 238.130381 388.599247 L 269.091418 390.224329 L 300.052455 389.556973 L 331.013493 389.416737 L 361.97453 388.913539 L 392.935568 326.565682 L 423.896605 322.943482 L 454.857643 389.985928 L 485.81868 389.721956 L 516.779718 389.763201 L 547.740755 389.433236 L 578.701793 390.282897 L 609.66283 388.765055 L 640.623868 388.979533 L 671.584905 389.457983 L 702.545943 380.730388 L 733.50698 375.598596 L 764.468018 57.586656 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
5040
  <defs>
5041
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
5042
  </defs>
5043
  <g clip-path="url(#pb49fc4c8d2)">
5044
- <use ns4:href="#m9b8c54d372" x="83.325193" y="398.136083" style="fill: #ff7f0e; stroke: #ff7f0e" />
5045
- <use ns4:href="#m9b8c54d372" x="114.286231" y="388.072127" style="fill: #ff7f0e; stroke: #ff7f0e" />
5046
- <use ns4:href="#m9b8c54d372" x="145.247268" y="387.931066" style="fill: #ff7f0e; stroke: #ff7f0e" />
5047
- <use ns4:href="#m9b8c54d372" x="176.208306" y="388.641318" style="fill: #ff7f0e; stroke: #ff7f0e" />
5048
- <use ns4:href="#m9b8c54d372" x="207.169343" y="388.731233" style="fill: #ff7f0e; stroke: #ff7f0e" />
5049
- <use ns4:href="#m9b8c54d372" x="238.130381" y="388.599247" style="fill: #ff7f0e; stroke: #ff7f0e" />
5050
- <use ns4:href="#m9b8c54d372" x="269.091418" y="390.224329" style="fill: #ff7f0e; stroke: #ff7f0e" />
5051
- <use ns4:href="#m9b8c54d372" x="300.052455" y="389.556973" style="fill: #ff7f0e; stroke: #ff7f0e" />
5052
- <use ns4:href="#m9b8c54d372" x="331.013493" y="389.416737" style="fill: #ff7f0e; stroke: #ff7f0e" />
5053
- <use ns4:href="#m9b8c54d372" x="361.97453" y="388.913539" style="fill: #ff7f0e; stroke: #ff7f0e" />
5054
- <use ns4:href="#m9b8c54d372" x="392.935568" y="326.565682" style="fill: #ff7f0e; stroke: #ff7f0e" />
5055
- <use ns4:href="#m9b8c54d372" x="423.896605" y="322.943482" style="fill: #ff7f0e; stroke: #ff7f0e" />
5056
- <use ns4:href="#m9b8c54d372" x="454.857643" y="389.985928" style="fill: #ff7f0e; stroke: #ff7f0e" />
5057
- <use ns4:href="#m9b8c54d372" x="485.81868" y="389.721956" style="fill: #ff7f0e; stroke: #ff7f0e" />
5058
- <use ns4:href="#m9b8c54d372" x="516.779718" y="389.763201" style="fill: #ff7f0e; stroke: #ff7f0e" />
5059
- <use ns4:href="#m9b8c54d372" x="547.740755" y="389.433236" style="fill: #ff7f0e; stroke: #ff7f0e" />
5060
- <use ns4:href="#m9b8c54d372" x="578.701793" y="390.282897" style="fill: #ff7f0e; stroke: #ff7f0e" />
5061
- <use ns4:href="#m9b8c54d372" x="609.66283" y="388.765055" style="fill: #ff7f0e; stroke: #ff7f0e" />
5062
- <use ns4:href="#m9b8c54d372" x="640.623868" y="388.979533" style="fill: #ff7f0e; stroke: #ff7f0e" />
5063
- <use ns4:href="#m9b8c54d372" x="671.584905" y="389.457983" style="fill: #ff7f0e; stroke: #ff7f0e" />
5064
- <use ns4:href="#m9b8c54d372" x="702.545943" y="380.730388" style="fill: #ff7f0e; stroke: #ff7f0e" />
5065
- <use ns4:href="#m9b8c54d372" x="733.50698" y="375.598596" style="fill: #ff7f0e; stroke: #ff7f0e" />
5066
- <use ns4:href="#m9b8c54d372" x="764.468018" y="57.586656" style="fill: #ff7f0e; stroke: #ff7f0e" />
5067
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
5068
  </g>
5069
  </g>
 
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
+ <dc:date>2025-12-19T23:02:31.637981</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
 
4233
  <g id="matplotlib.axis_2">
4234
  <g id="ytick_1">
4235
  <g id="grid-y--2" class="grid grid-y">
4236
+ <path d="M 47.72 375.771468 L 831.034248 375.771468 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4237
  </g>
4238
  <g id="line2d_25">
4239
  <defs>
4240
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4241
  </defs>
4242
  <g>
4243
+ <use ns4:href="#m0fca2865ba" x="47.72" y="375.771468" style="stroke: #000000; stroke-width: 0.8" />
4244
  </g>
4245
  </g>
4246
  <g id="text_25">
4247
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="379.570687" transform="rotate(-0 40.72 379.570687)">0.1</text>
4248
  </g>
4249
  </g>
4250
  <g id="ytick_2">
4251
  <g id="grid-y--3" class="grid grid-y">
4252
+ <path d="M 47.72 293.090475 L 831.034248 293.090475 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4253
  </g>
4254
  <g id="line2d_26">
4255
  <g>
4256
+ <use ns4:href="#m0fca2865ba" x="47.72" y="293.090475" style="stroke: #000000; stroke-width: 0.8" />
4257
  </g>
4258
  </g>
4259
  <g id="text_26">
4260
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.889693" transform="rotate(-0 40.72 296.889693)">0.2</text>
4261
  </g>
4262
  </g>
4263
  <g id="ytick_3">
4264
  <g id="grid-y--4" class="grid grid-y">
4265
+ <path d="M 47.72 210.409481 L 831.034248 210.409481 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4266
  </g>
4267
  <g id="line2d_27">
4268
  <g>
4269
+ <use ns4:href="#m0fca2865ba" x="47.72" y="210.409481" style="stroke: #000000; stroke-width: 0.8" />
4270
  </g>
4271
  </g>
4272
  <g id="text_27">
4273
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="214.2087" transform="rotate(-0 40.72 214.2087)">0.3</text>
4274
  </g>
4275
  </g>
4276
  <g id="ytick_4">
4277
  <g id="grid-y--5" class="grid grid-y">
4278
+ <path d="M 47.72 127.728488 L 831.034248 127.728488 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4279
  </g>
4280
  <g id="line2d_28">
4281
  <g>
4282
+ <use ns4:href="#m0fca2865ba" x="47.72" y="127.728488" style="stroke: #000000; stroke-width: 0.8" />
4283
  </g>
4284
  </g>
4285
  <g id="text_28">
4286
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="131.527707" transform="rotate(-0 40.72 131.527707)">0.4</text>
4287
  </g>
4288
  </g>
4289
  <g id="ytick_5">
4290
  <g id="grid-y--6" class="grid grid-y">
4291
+ <path d="M 47.72 45.047495 L 831.034248 45.047495 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4292
  </g>
4293
  <g id="line2d_29">
4294
  <g>
4295
+ <use ns4:href="#m0fca2865ba" x="47.72" y="45.047495" style="stroke: #000000; stroke-width: 0.8" />
4296
  </g>
4297
  </g>
4298
  <g id="text_29">
4299
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="48.846713" transform="rotate(-0 40.72 48.846713)">0.5</text>
4300
  </g>
4301
  </g>
4302
  <g id="label--y" class="ylabel">
 
4304
  </g>
4305
  </g>
4306
  <g id="series--hf-kernels-causal-conv1d" class="series">
4307
+ <path d="M 83.325193 420.186871 L 114.286231 415.788242 L 145.247268 415.192112 L 176.208306 416.325668 L 207.169343 417.499739 L 238.130381 417.458398 L 269.091418 417.259137 L 300.052455 417.251695 L 331.013493 417.656832 L 361.97453 417.400521 L 392.935568 418.152918 L 423.896605 417.49147 L 454.857643 417.541079 L 485.81868 417.425325 L 516.779718 416.656392 L 547.740755 416.028017 L 578.701793 417.391426 L 609.66283 417.293036 L 640.623868 416.912703 L 671.584905 417.367449 L 702.545943 417.425325 L 733.50698 417.400521 L 764.468018 417.904875 L 795.429055 417.466666 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4308
  <defs>
4309
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4310
  </defs>
4311
  <g clip-path="url(#pb49fc4c8d2)">
4312
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
4313
+ <use ns4:href="#md7efaf3aec" x="114.286231" y="415.788242" style="fill: #1f77b4; stroke: #1f77b4" />
4314
+ <use ns4:href="#md7efaf3aec" x="145.247268" y="415.192112" style="fill: #1f77b4; stroke: #1f77b4" />
4315
+ <use ns4:href="#md7efaf3aec" x="176.208306" y="416.325668" style="fill: #1f77b4; stroke: #1f77b4" />
4316
+ <use ns4:href="#md7efaf3aec" x="207.169343" y="417.499739" style="fill: #1f77b4; stroke: #1f77b4" />
4317
+ <use ns4:href="#md7efaf3aec" x="238.130381" y="417.458398" style="fill: #1f77b4; stroke: #1f77b4" />
4318
+ <use ns4:href="#md7efaf3aec" x="269.091418" y="417.259137" style="fill: #1f77b4; stroke: #1f77b4" />
4319
+ <use ns4:href="#md7efaf3aec" x="300.052455" y="417.251695" style="fill: #1f77b4; stroke: #1f77b4" />
4320
+ <use ns4:href="#md7efaf3aec" x="331.013493" y="417.656832" style="fill: #1f77b4; stroke: #1f77b4" />
4321
+ <use ns4:href="#md7efaf3aec" x="361.97453" y="417.400521" style="fill: #1f77b4; stroke: #1f77b4" />
4322
+ <use ns4:href="#md7efaf3aec" x="392.935568" y="418.152918" style="fill: #1f77b4; stroke: #1f77b4" />
4323
+ <use ns4:href="#md7efaf3aec" x="423.896605" y="417.49147" style="fill: #1f77b4; stroke: #1f77b4" />
4324
+ <use ns4:href="#md7efaf3aec" x="454.857643" y="417.541079" style="fill: #1f77b4; stroke: #1f77b4" />
4325
+ <use ns4:href="#md7efaf3aec" x="485.81868" y="417.425325" style="fill: #1f77b4; stroke: #1f77b4" />
4326
+ <use ns4:href="#md7efaf3aec" x="516.779718" y="416.656392" style="fill: #1f77b4; stroke: #1f77b4" />
4327
+ <use ns4:href="#md7efaf3aec" x="547.740755" y="416.028017" style="fill: #1f77b4; stroke: #1f77b4" />
4328
+ <use ns4:href="#md7efaf3aec" x="578.701793" y="417.391426" style="fill: #1f77b4; stroke: #1f77b4" />
4329
+ <use ns4:href="#md7efaf3aec" x="609.66283" y="417.293036" style="fill: #1f77b4; stroke: #1f77b4" />
4330
+ <use ns4:href="#md7efaf3aec" x="640.623868" y="416.912703" style="fill: #1f77b4; stroke: #1f77b4" />
4331
+ <use ns4:href="#md7efaf3aec" x="671.584905" y="417.367449" style="fill: #1f77b4; stroke: #1f77b4" />
4332
+ <use ns4:href="#md7efaf3aec" x="702.545943" y="417.425325" style="fill: #1f77b4; stroke: #1f77b4" />
4333
+ <use ns4:href="#md7efaf3aec" x="733.50698" y="417.400521" style="fill: #1f77b4; stroke: #1f77b4" />
4334
+ <use ns4:href="#md7efaf3aec" x="764.468018" y="417.904875" style="fill: #1f77b4; stroke: #1f77b4" />
4335
+ <use ns4:href="#md7efaf3aec" x="795.429055" y="417.466666" style="fill: #1f77b4; stroke: #1f77b4" />
4336
  </g>
4337
  </g>
4338
  <g id="series--torch-eager" class="series">
4339
+ <path d="M 83.325193 401.186778 L 114.286231 389.759438 L 145.247268 389.88346 L 176.208306 391.429594 L 207.169343 391.437862 L 238.130381 391.3064 L 269.091418 390.148866 L 300.052455 391.892608 L 331.013493 391.718978 L 361.97453 391.488298 L 392.935568 327.44112 L 423.896605 323.861033 L 454.857643 393.09975 L 485.81868 392.802099 L 516.779718 391.967848 L 547.740755 391.330377 L 578.701793 391.702442 L 609.66283 391.388254 L 640.623868 392.347353 L 671.584905 392.611933 L 702.545943 382.05357 L 733.50698 376.74545 L 764.468018 59.053655 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4340
  <defs>
4341
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4342
  </defs>
4343
  <g clip-path="url(#pb49fc4c8d2)">
4344
+ <use ns4:href="#m9b8c54d372" x="83.325193" y="401.186778" style="fill: #ff7f0e; stroke: #ff7f0e" />
4345
+ <use ns4:href="#m9b8c54d372" x="114.286231" y="389.759438" style="fill: #ff7f0e; stroke: #ff7f0e" />
4346
+ <use ns4:href="#m9b8c54d372" x="145.247268" y="389.88346" style="fill: #ff7f0e; stroke: #ff7f0e" />
4347
+ <use ns4:href="#m9b8c54d372" x="176.208306" y="391.429594" style="fill: #ff7f0e; stroke: #ff7f0e" />
4348
+ <use ns4:href="#m9b8c54d372" x="207.169343" y="391.437862" style="fill: #ff7f0e; stroke: #ff7f0e" />
4349
+ <use ns4:href="#m9b8c54d372" x="238.130381" y="391.3064" style="fill: #ff7f0e; stroke: #ff7f0e" />
4350
+ <use ns4:href="#m9b8c54d372" x="269.091418" y="390.148866" style="fill: #ff7f0e; stroke: #ff7f0e" />
4351
+ <use ns4:href="#m9b8c54d372" x="300.052455" y="391.892608" style="fill: #ff7f0e; stroke: #ff7f0e" />
4352
+ <use ns4:href="#m9b8c54d372" x="331.013493" y="391.718978" style="fill: #ff7f0e; stroke: #ff7f0e" />
4353
+ <use ns4:href="#m9b8c54d372" x="361.97453" y="391.488298" style="fill: #ff7f0e; stroke: #ff7f0e" />
4354
+ <use ns4:href="#m9b8c54d372" x="392.935568" y="327.44112" style="fill: #ff7f0e; stroke: #ff7f0e" />
4355
+ <use ns4:href="#m9b8c54d372" x="423.896605" y="323.861033" style="fill: #ff7f0e; stroke: #ff7f0e" />
4356
+ <use ns4:href="#m9b8c54d372" x="454.857643" y="393.09975" style="fill: #ff7f0e; stroke: #ff7f0e" />
4357
+ <use ns4:href="#m9b8c54d372" x="485.81868" y="392.802099" style="fill: #ff7f0e; stroke: #ff7f0e" />
4358
+ <use ns4:href="#m9b8c54d372" x="516.779718" y="391.967848" style="fill: #ff7f0e; stroke: #ff7f0e" />
4359
+ <use ns4:href="#m9b8c54d372" x="547.740755" y="391.330377" style="fill: #ff7f0e; stroke: #ff7f0e" />
4360
+ <use ns4:href="#m9b8c54d372" x="578.701793" y="391.702442" style="fill: #ff7f0e; stroke: #ff7f0e" />
4361
+ <use ns4:href="#m9b8c54d372" x="609.66283" y="391.388254" style="fill: #ff7f0e; stroke: #ff7f0e" />
4362
+ <use ns4:href="#m9b8c54d372" x="640.623868" y="392.347353" style="fill: #ff7f0e; stroke: #ff7f0e" />
4363
+ <use ns4:href="#m9b8c54d372" x="671.584905" y="392.611933" style="fill: #ff7f0e; stroke: #ff7f0e" />
4364
+ <use ns4:href="#m9b8c54d372" x="702.545943" y="382.05357" style="fill: #ff7f0e; stroke: #ff7f0e" />
4365
+ <use ns4:href="#m9b8c54d372" x="733.50698" y="376.74545" style="fill: #ff7f0e; stroke: #ff7f0e" />
4366
+ <use ns4:href="#m9b8c54d372" x="764.468018" y="59.053655" style="fill: #ff7f0e; stroke: #ff7f0e" />
4367
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
4368
  </g>
4369
  </g>
 
4422
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4423
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4424
  </span> |
4425
+ Cell: combine | 4.67s
4426
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4427
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4428
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4547
  torch_eager cuda_B2_D64_S512_W4 0.08 True
4548
  torch_eager cuda_B4_D2048_S128_W2 0.08 True
4549
  torch_eager cuda_B4_D2048_S128_W4 0.08 True
4550
+ torch_eager cuda_B4_D2048_S2048_W2 0.48 True
4551
  torch_eager cuda_B4_D2048_S2048_W4 0.50 True
4552
  torch_eager cuda_B4_D2048_S512_W2 0.09 True
4553
  torch_eager cuda_B4_D2048_S512_W4 0.10 True
 
4576
  <div class="uv-install-logs" id="uv-logs-combine">
4577
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4578
  <div class="uv-logs-content" style="display: none;">
4579
+ Installed 37 packages in 343ms
4580
  </div>
4581
  </div>
4582
  <div class="cell-artifacts">
 
4589
  <rdf:RDF>
4590
  <ns2:Work>
4591
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4592
+ <dc:date>2025-12-19T23:02:31.637981</dc:date>
4593
  <dc:format>image/svg+xml</dc:format>
4594
  <dc:creator>
4595
  <ns2:Agent>
 
4933
  <g id="matplotlib.axis_2">
4934
  <g id="ytick_1">
4935
  <g id="grid-y--2" class="grid grid-y">
4936
+ <path d="M 47.72 375.771468 L 831.034248 375.771468 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4937
  </g>
4938
  <g id="line2d_25">
4939
  <defs>
4940
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4941
  </defs>
4942
  <g>
4943
+ <use ns4:href="#m0fca2865ba" x="47.72" y="375.771468" style="stroke: #000000; stroke-width: 0.8" />
4944
  </g>
4945
  </g>
4946
  <g id="text_25">
4947
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="379.570687" transform="rotate(-0 40.72 379.570687)">0.1</text>
4948
  </g>
4949
  </g>
4950
  <g id="ytick_2">
4951
  <g id="grid-y--3" class="grid grid-y">
4952
+ <path d="M 47.72 293.090475 L 831.034248 293.090475 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4953
  </g>
4954
  <g id="line2d_26">
4955
  <g>
4956
+ <use ns4:href="#m0fca2865ba" x="47.72" y="293.090475" style="stroke: #000000; stroke-width: 0.8" />
4957
  </g>
4958
  </g>
4959
  <g id="text_26">
4960
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="296.889693" transform="rotate(-0 40.72 296.889693)">0.2</text>
4961
  </g>
4962
  </g>
4963
  <g id="ytick_3">
4964
  <g id="grid-y--4" class="grid grid-y">
4965
+ <path d="M 47.72 210.409481 L 831.034248 210.409481 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4966
  </g>
4967
  <g id="line2d_27">
4968
  <g>
4969
+ <use ns4:href="#m0fca2865ba" x="47.72" y="210.409481" style="stroke: #000000; stroke-width: 0.8" />
4970
  </g>
4971
  </g>
4972
  <g id="text_27">
4973
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="214.2087" transform="rotate(-0 40.72 214.2087)">0.3</text>
4974
  </g>
4975
  </g>
4976
  <g id="ytick_4">
4977
  <g id="grid-y--5" class="grid grid-y">
4978
+ <path d="M 47.72 127.728488 L 831.034248 127.728488 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4979
  </g>
4980
  <g id="line2d_28">
4981
  <g>
4982
+ <use ns4:href="#m0fca2865ba" x="47.72" y="127.728488" style="stroke: #000000; stroke-width: 0.8" />
4983
  </g>
4984
  </g>
4985
  <g id="text_28">
4986
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="131.527707" transform="rotate(-0 40.72 131.527707)">0.4</text>
4987
  </g>
4988
  </g>
4989
  <g id="ytick_5">
4990
  <g id="grid-y--6" class="grid grid-y">
4991
+ <path d="M 47.72 45.047495 L 831.034248 45.047495 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4992
  </g>
4993
  <g id="line2d_29">
4994
  <g>
4995
+ <use ns4:href="#m0fca2865ba" x="47.72" y="45.047495" style="stroke: #000000; stroke-width: 0.8" />
4996
  </g>
4997
  </g>
4998
  <g id="text_29">
4999
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="48.846713" transform="rotate(-0 40.72 48.846713)">0.5</text>
5000
  </g>
5001
  </g>
5002
  <g id="label--y" class="ylabel">
 
5004
  </g>
5005
  </g>
5006
  <g id="series--hf-kernels-causal-conv1d" class="series">
5007
+ <path d="M 83.325193 420.186871 L 114.286231 415.788242 L 145.247268 415.192112 L 176.208306 416.325668 L 207.169343 417.499739 L 238.130381 417.458398 L 269.091418 417.259137 L 300.052455 417.251695 L 331.013493 417.656832 L 361.97453 417.400521 L 392.935568 418.152918 L 423.896605 417.49147 L 454.857643 417.541079 L 485.81868 417.425325 L 516.779718 416.656392 L 547.740755 416.028017 L 578.701793 417.391426 L 609.66283 417.293036 L 640.623868 416.912703 L 671.584905 417.367449 L 702.545943 417.425325 L 733.50698 417.400521 L 764.468018 417.904875 L 795.429055 417.466666 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
5008
  <defs>
5009
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
5010
  </defs>
5011
  <g clip-path="url(#pb49fc4c8d2)">
5012
  <use ns4:href="#md7efaf3aec" x="83.325193" y="420.186871" style="fill: #1f77b4; stroke: #1f77b4" />
5013
+ <use ns4:href="#md7efaf3aec" x="114.286231" y="415.788242" style="fill: #1f77b4; stroke: #1f77b4" />
5014
+ <use ns4:href="#md7efaf3aec" x="145.247268" y="415.192112" style="fill: #1f77b4; stroke: #1f77b4" />
5015
+ <use ns4:href="#md7efaf3aec" x="176.208306" y="416.325668" style="fill: #1f77b4; stroke: #1f77b4" />
5016
+ <use ns4:href="#md7efaf3aec" x="207.169343" y="417.499739" style="fill: #1f77b4; stroke: #1f77b4" />
5017
+ <use ns4:href="#md7efaf3aec" x="238.130381" y="417.458398" style="fill: #1f77b4; stroke: #1f77b4" />
5018
+ <use ns4:href="#md7efaf3aec" x="269.091418" y="417.259137" style="fill: #1f77b4; stroke: #1f77b4" />
5019
+ <use ns4:href="#md7efaf3aec" x="300.052455" y="417.251695" style="fill: #1f77b4; stroke: #1f77b4" />
5020
+ <use ns4:href="#md7efaf3aec" x="331.013493" y="417.656832" style="fill: #1f77b4; stroke: #1f77b4" />
5021
+ <use ns4:href="#md7efaf3aec" x="361.97453" y="417.400521" style="fill: #1f77b4; stroke: #1f77b4" />
5022
+ <use ns4:href="#md7efaf3aec" x="392.935568" y="418.152918" style="fill: #1f77b4; stroke: #1f77b4" />
5023
+ <use ns4:href="#md7efaf3aec" x="423.896605" y="417.49147" style="fill: #1f77b4; stroke: #1f77b4" />
5024
+ <use ns4:href="#md7efaf3aec" x="454.857643" y="417.541079" style="fill: #1f77b4; stroke: #1f77b4" />
5025
+ <use ns4:href="#md7efaf3aec" x="485.81868" y="417.425325" style="fill: #1f77b4; stroke: #1f77b4" />
5026
+ <use ns4:href="#md7efaf3aec" x="516.779718" y="416.656392" style="fill: #1f77b4; stroke: #1f77b4" />
5027
+ <use ns4:href="#md7efaf3aec" x="547.740755" y="416.028017" style="fill: #1f77b4; stroke: #1f77b4" />
5028
+ <use ns4:href="#md7efaf3aec" x="578.701793" y="417.391426" style="fill: #1f77b4; stroke: #1f77b4" />
5029
+ <use ns4:href="#md7efaf3aec" x="609.66283" y="417.293036" style="fill: #1f77b4; stroke: #1f77b4" />
5030
+ <use ns4:href="#md7efaf3aec" x="640.623868" y="416.912703" style="fill: #1f77b4; stroke: #1f77b4" />
5031
+ <use ns4:href="#md7efaf3aec" x="671.584905" y="417.367449" style="fill: #1f77b4; stroke: #1f77b4" />
5032
+ <use ns4:href="#md7efaf3aec" x="702.545943" y="417.425325" style="fill: #1f77b4; stroke: #1f77b4" />
5033
+ <use ns4:href="#md7efaf3aec" x="733.50698" y="417.400521" style="fill: #1f77b4; stroke: #1f77b4" />
5034
+ <use ns4:href="#md7efaf3aec" x="764.468018" y="417.904875" style="fill: #1f77b4; stroke: #1f77b4" />
5035
+ <use ns4:href="#md7efaf3aec" x="795.429055" y="417.466666" style="fill: #1f77b4; stroke: #1f77b4" />
5036
  </g>
5037
  </g>
5038
  <g id="series--torch-eager" class="series">
5039
+ <path d="M 83.325193 401.186778 L 114.286231 389.759438 L 145.247268 389.88346 L 176.208306 391.429594 L 207.169343 391.437862 L 238.130381 391.3064 L 269.091418 390.148866 L 300.052455 391.892608 L 331.013493 391.718978 L 361.97453 391.488298 L 392.935568 327.44112 L 423.896605 323.861033 L 454.857643 393.09975 L 485.81868 392.802099 L 516.779718 391.967848 L 547.740755 391.330377 L 578.701793 391.702442 L 609.66283 391.388254 L 640.623868 392.347353 L 671.584905 392.611933 L 702.545943 382.05357 L 733.50698 376.74545 L 764.468018 59.053655 L 795.429055 45.608899 " clip-path="url(#pb49fc4c8d2)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
5040
  <defs>
5041
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
5042
  </defs>
5043
  <g clip-path="url(#pb49fc4c8d2)">
5044
+ <use ns4:href="#m9b8c54d372" x="83.325193" y="401.186778" style="fill: #ff7f0e; stroke: #ff7f0e" />
5045
+ <use ns4:href="#m9b8c54d372" x="114.286231" y="389.759438" style="fill: #ff7f0e; stroke: #ff7f0e" />
5046
+ <use ns4:href="#m9b8c54d372" x="145.247268" y="389.88346" style="fill: #ff7f0e; stroke: #ff7f0e" />
5047
+ <use ns4:href="#m9b8c54d372" x="176.208306" y="391.429594" style="fill: #ff7f0e; stroke: #ff7f0e" />
5048
+ <use ns4:href="#m9b8c54d372" x="207.169343" y="391.437862" style="fill: #ff7f0e; stroke: #ff7f0e" />
5049
+ <use ns4:href="#m9b8c54d372" x="238.130381" y="391.3064" style="fill: #ff7f0e; stroke: #ff7f0e" />
5050
+ <use ns4:href="#m9b8c54d372" x="269.091418" y="390.148866" style="fill: #ff7f0e; stroke: #ff7f0e" />
5051
+ <use ns4:href="#m9b8c54d372" x="300.052455" y="391.892608" style="fill: #ff7f0e; stroke: #ff7f0e" />
5052
+ <use ns4:href="#m9b8c54d372" x="331.013493" y="391.718978" style="fill: #ff7f0e; stroke: #ff7f0e" />
5053
+ <use ns4:href="#m9b8c54d372" x="361.97453" y="391.488298" style="fill: #ff7f0e; stroke: #ff7f0e" />
5054
+ <use ns4:href="#m9b8c54d372" x="392.935568" y="327.44112" style="fill: #ff7f0e; stroke: #ff7f0e" />
5055
+ <use ns4:href="#m9b8c54d372" x="423.896605" y="323.861033" style="fill: #ff7f0e; stroke: #ff7f0e" />
5056
+ <use ns4:href="#m9b8c54d372" x="454.857643" y="393.09975" style="fill: #ff7f0e; stroke: #ff7f0e" />
5057
+ <use ns4:href="#m9b8c54d372" x="485.81868" y="392.802099" style="fill: #ff7f0e; stroke: #ff7f0e" />
5058
+ <use ns4:href="#m9b8c54d372" x="516.779718" y="391.967848" style="fill: #ff7f0e; stroke: #ff7f0e" />
5059
+ <use ns4:href="#m9b8c54d372" x="547.740755" y="391.330377" style="fill: #ff7f0e; stroke: #ff7f0e" />
5060
+ <use ns4:href="#m9b8c54d372" x="578.701793" y="391.702442" style="fill: #ff7f0e; stroke: #ff7f0e" />
5061
+ <use ns4:href="#m9b8c54d372" x="609.66283" y="391.388254" style="fill: #ff7f0e; stroke: #ff7f0e" />
5062
+ <use ns4:href="#m9b8c54d372" x="640.623868" y="392.347353" style="fill: #ff7f0e; stroke: #ff7f0e" />
5063
+ <use ns4:href="#m9b8c54d372" x="671.584905" y="392.611933" style="fill: #ff7f0e; stroke: #ff7f0e" />
5064
+ <use ns4:href="#m9b8c54d372" x="702.545943" y="382.05357" style="fill: #ff7f0e; stroke: #ff7f0e" />
5065
+ <use ns4:href="#m9b8c54d372" x="733.50698" y="376.74545" style="fill: #ff7f0e; stroke: #ff7f0e" />
5066
+ <use ns4:href="#m9b8c54d372" x="764.468018" y="59.053655" style="fill: #ff7f0e; stroke: #ff7f0e" />
5067
  <use ns4:href="#m9b8c54d372" x="795.429055" y="45.608899" style="fill: #ff7f0e; stroke: #ff7f0e" />
5068
  </g>
5069
  </g>
deformable_detr/impls/artifacts/benchmark/deformable_detr.jsonl CHANGED
@@ -1,4 +1,4 @@
1
- {"ts": "2025-12-19T19:41:40Z", "run": "e6f28dfc458847cc825acfc40a1937dc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_Q100_H8_E256_L4_P4", "batch_size": 1, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.4261789999732173, "p50": 3.457469000011315, "p90": 3.459429999963959, "mean": 3.4539671999937127, "iqr": 0.0022309999962999427, "raw_times": [3.459429999963959, 3.4695590000524135, 3.457469000011315, 3.457198999967659, 3.4261789999732173], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.4921710000048733, "peak_bytes": 5929472, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
2
- {"ts": "2025-12-19T19:41:40Z", "run": "e6f28dfc458847cc825acfc40a1937dc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B1_Q300_H8_E256_L4_P4", "batch_size": 1, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.229746999953932, "p50": 4.235096999991583, "p90": 4.236528000035378, "mean": 4.242877599995154, "iqr": 0.0027010000280824897, "raw_times": [4.236528000035378, 4.235096999991583, 4.279188999987582, 4.233827000007295, 4.229746999953932], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.256637999958457, "peak_bytes": 15161856, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
3
- {"ts": "2025-12-19T19:41:41Z", "run": "e6f28dfc458847cc825acfc40a1937dc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_Q100_H8_E256_L4_P4", "batch_size": 2, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.2112570000085725, "p50": 4.251798000041163, "p90": 4.262317999973675, "mean": 4.247635800004446, "iqr": 0.04195999997591571, "raw_times": [4.2112570000085725, 4.262317999973675, 4.220357999997759, 4.251798000041163, 4.292448000001059], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.28278900000123, "peak_bytes": 11958784, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
4
- {"ts": "2025-12-19T19:41:41Z", "run": "e6f28dfc458847cc825acfc40a1937dc", "impl": "torch_eager", "tags": {"family": "pytorch", "backend": "eager"}, "wl": {"name": "cuda_B2_Q300_H8_E256_L4_P4", "batch_size": 2, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 4.324839999981123, "p50": 4.342328999996425, "p90": 4.349751000006563, "mean": 4.3508561999942685, "iqr": 0.022982000018600957, "raw_times": [4.326768999987962, 4.410591999999269, 4.342328999996425, 4.324839999981123, 4.349751000006563], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 4.316158999984054, "peak_bytes": 30977024, "ok": true, "absmax": 0.0, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 0.0, "mae": 0.0, "mse": 0.0, "ref": "deformable_detr_torch"}, "err": null}
 
1
+ {"ts": "2025-12-19T23:02:21Z", "run": "bd3674eb0704484693460041fd14f59b", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_Q100_H8_E256_L4_P4", "batch_size": 1, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.03611099987210764, "p50": 0.037491000057343626, "p90": 0.038670999856549315, "mean": 0.03807699995377334, "iqr": 0.0014299998838396277, "raw_times": [0.04087100001015642, 0.038670999856549315, 0.03724099997270969, 0.037491000057343626, 0.03611099987210764], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04587100011121947, "peak_bytes": 2264064, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.539113701047427e-08, "mse": 6.418638644407112e-15, "ref": "deformable_detr_torch"}, "err": null}
2
+ {"ts": "2025-12-19T23:02:21Z", "run": "bd3674eb0704484693460041fd14f59b", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_Q300_H8_E256_L4_P4", "batch_size": 1, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0427610000315326, "p50": 0.04391099992062664, "p90": 0.04453099995771481, "mean": 0.043983000023217755, "iqr": 0.0007099997674231417, "raw_times": [0.0427610000315326, 0.04453099995771481, 0.044891000015923055, 0.04382100019029167, 0.04391099992062664], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04723100005321612, "peak_bytes": 4004864, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.559346050176828e-08, "mse": 6.4289483059246175e-15, "ref": "deformable_detr_torch"}, "err": null}
3
+ {"ts": "2025-12-19T23:02:21Z", "run": "bd3674eb0704484693460041fd14f59b", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_Q100_H8_E256_L4_P4", "batch_size": 2, "num_queries": 100, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04207100005260145, "p50": 0.04387099988889531, "p90": 0.044481000031737494, "mean": 0.04371499999251682, "iqr": 0.0019200001588615123, "raw_times": [0.04387099988889531, 0.044481000031737494, 0.04559100011647388, 0.04256099987287598, 0.04207100005260145], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04643099987333699, "peak_bytes": 5459968, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.555110149657594e-08, "mse": 6.418781369458724e-15, "ref": "deformable_detr_torch"}, "err": null}
4
+ {"ts": "2025-12-19T23:02:21Z", "run": "bd3674eb0704484693460041fd14f59b", "impl": "hf_kernels_deformable_detr", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_Q300_H8_E256_L4_P4", "batch_size": 2, "num_queries": 300, "num_heads": 8, "embed_dim": 256, "num_levels": 4, "num_points": 4, "spatial_shapes": [[32, 32], [16, 16], [8, 8], [4, 4]], "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.04534100003183994, "p50": 0.04615100010596507, "p90": 0.04615100010596507, "mean": 0.045852800030843355, "iqr": 0.0007410001217067474, "raw_times": [0.04615100010596507, 0.04540999998425832, 0.04615100010596507, 0.04534100003183994, 0.04621099992618838], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.04675100012718758, "peak_bytes": 8008704, "ok": true, "absmax": 7.152557373046875e-07, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax": 7.152557373046875e-07, "mae": 5.5905669427147586e-08, "mse": 6.485184940875199e-15, "ref": "deformable_detr_torch"}, "err": null}
deformable_detr/impls/cells/benchmark.py CHANGED
@@ -4,6 +4,7 @@
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
 
7
  # ]
8
  #
9
  # [tool.uv.sources]
@@ -12,107 +13,30 @@
12
  import torch
13
  import sys
14
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
 
15
 
 
 
16
 
17
- def torch_deformable_detr(
 
18
  value, spatial_shapes, level_start_index, sampling_locations, attention_weights, im2col_step=64
19
  ):
20
- """
21
- PyTorch native reference implementation of multi-scale deformable attention.
22
- Uses vectorized bilinear interpolation for reasonable performance.
23
- """
24
- bs, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
25
- _, _, _, channels = value.shape
26
-
27
- output = torch.zeros(bs, num_queries, num_heads, channels, device=value.device, dtype=value.dtype)
28
-
29
- # Split value tensor by levels
30
- value_list = value.split([int(h * w) for h, w in spatial_shapes.tolist()], dim=1)
31
-
32
- # Iterate through each level (can't avoid this loop easily)
33
- for level_idx in range(num_levels):
34
- h, w = spatial_shapes[level_idx].tolist()
35
- value_level = value_list[level_idx] # (bs, h*w, num_heads, channels)
36
-
37
- # Reshape to spatial grid: (bs, num_heads, channels, h, w)
38
- value_spatial = value_level.reshape(bs, h, w, num_heads, channels).permute(0, 3, 4, 1, 2)
39
-
40
- # Get sampling locations and weights for this level
41
- # loc: (bs, num_queries, num_heads, num_points, 2)
42
- loc = sampling_locations[:, :, :, level_idx, :, :]
43
- # weight: (bs, num_queries, num_heads, num_points)
44
- weight = attention_weights[:, :, :, level_idx, :]
45
-
46
- # Convert normalized coordinates to pixel coordinates
47
- # loc[..., 0] is x (width), loc[..., 1] is y (height)
48
- x = loc[..., 0] * w - 0.5 # (bs, num_queries, num_heads, num_points)
49
- y = loc[..., 1] * h - 0.5
50
-
51
- # Get integer coordinates for bilinear interpolation
52
- x0 = torch.floor(x).long()
53
- y0 = torch.floor(y).long()
54
- x1 = x0 + 1
55
- y1 = y0 + 1
56
-
57
- # Compute interpolation weights BEFORE clamping (important!)
58
- lw = x - x0.float() # weight for x direction
59
- lh = y - y0.float() # weight for y direction
60
- hw = 1 - lw
61
- hh = 1 - lh
62
-
63
- # Create mask for valid sample locations
64
- valid = (y > -1) & (x > -1) & (y < h) & (x < w)
65
-
66
- # Create masks for each corner being in bounds
67
- mask_tl = ((y0 >= 0) & (x0 >= 0)).unsqueeze(-1).float()
68
- mask_tr = ((y0 >= 0) & (x1 <= w - 1)).unsqueeze(-1).float()
69
- mask_bl = ((y1 <= h - 1) & (x0 >= 0)).unsqueeze(-1).float()
70
- mask_br = ((y1 <= h - 1) & (x1 <= w - 1)).unsqueeze(-1).float()
71
-
72
- # Clamp coordinates for safe indexing
73
- x0_clamped = torch.clamp(x0, 0, w - 1)
74
- x1_clamped = torch.clamp(x1, 0, w - 1)
75
- y0_clamped = torch.clamp(y0, 0, h - 1)
76
- y1_clamped = torch.clamp(y1, 0, h - 1)
77
-
78
- # Bilinear interpolation weights for all 4 corners
79
- w_tl = (hh * hw).unsqueeze(-1) # top-left: (bs, num_queries, num_heads, num_points, 1)
80
- w_tr = (hh * lw).unsqueeze(-1) # top-right
81
- w_bl = (lh * hw).unsqueeze(-1) # bottom-left
82
- w_br = (lh * lw).unsqueeze(-1) # bottom-right
83
-
84
- # Gather values from the 4 corners using advanced indexing
85
- batch_idx = torch.arange(bs, device=value.device).view(bs, 1, 1, 1).expand(bs, num_queries, num_heads, num_points)
86
- head_idx = torch.arange(num_heads, device=value.device).view(1, 1, num_heads, 1).expand(bs, num_queries, num_heads, num_points)
87
-
88
- # Gather corner values with clamped indices, then apply corner masks
89
- v_tl = value_spatial[batch_idx, head_idx, :, y0_clamped, x0_clamped] * mask_tl
90
- v_tr = value_spatial[batch_idx, head_idx, :, y0_clamped, x1_clamped] * mask_tr
91
- v_bl = value_spatial[batch_idx, head_idx, :, y1_clamped, x0_clamped] * mask_bl
92
- v_br = value_spatial[batch_idx, head_idx, :, y1_clamped, x1_clamped] * mask_br
93
-
94
- # Bilinear interpolation
95
- sampled = w_tl * v_tl + w_tr * v_tr + w_bl * v_bl + w_br * v_br
96
-
97
- # Apply valid mask (only accumulate if entire sample location is valid)
98
- sampled = sampled * valid.unsqueeze(-1).float()
99
-
100
- # Apply attention weights and sum over points
101
- # weight: (bs, num_queries, num_heads, num_points)
102
- # Expand weight: (bs, num_queries, num_heads, num_points, 1)
103
- weighted_sampled = sampled * weight.unsqueeze(-1)
104
-
105
- # Sum over points: (bs, num_queries, num_heads, channels)
106
- output += weighted_sampled.sum(dim=3)
107
-
108
- # Flatten last two dimensions to match kernel output
109
- return output.reshape(bs, num_queries, num_heads * channels)
110
 
111
 
112
  run_benchmark(
113
  kernel_type=KernelTypeEnum.DEFORMABLE_DETR,
114
- impl_name="torch_eager",
115
- impl_tags={"family": "pytorch", "backend": "eager"},
116
- impl_func=torch_deformable_detr,
117
  dtype="float32",
118
  )
 
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
+ # "kernels",
8
  # ]
9
  #
10
  # [tool.uv.sources]
 
13
  import torch
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
+ from kernels import get_kernel
17
 
18
+ # Load the deformable DETR kernel
19
+ deformable_detr = get_kernel("kernels-community/deformable-detr")
20
 
21
+
22
+ def hf_kernels_deformable_detr(
23
  value, spatial_shapes, level_start_index, sampling_locations, attention_weights, im2col_step=64
24
  ):
25
+ """HuggingFace Kernels Deformable DETR Multi-Scale Deformable Attention"""
26
+ return deformable_detr.ms_deform_attn_forward(
27
+ value=value,
28
+ spatial_shapes=spatial_shapes,
29
+ level_start_index=level_start_index,
30
+ sampling_loc=sampling_locations,
31
+ attn_weight=attention_weights,
32
+ im2col_step=im2col_step
33
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
 
36
  run_benchmark(
37
  kernel_type=KernelTypeEnum.DEFORMABLE_DETR,
38
+ impl_name="hf_kernels_deformable_detr",
39
+ impl_tags={"family": "hf-kernels", "backend": "cuda"},
40
+ impl_func=hf_kernels_deformable_detr,
41
  dtype="float32",
42
  )
deformable_detr/impls/hf_kernels_deformable_detr.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.28s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3905,7 +3905,7 @@ Cell: nv | 0.28s
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
- <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 19:41:27 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
@@ -3914,7 +3914,7 @@ Cell: nv | 0.28s
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
- | N/A 32C P0 120W / 350W | 0MiB / 46068MiB | 92% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.28s
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3940
  </span> |
3941
- Cell: benchmark | 8.66s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -4003,24 +4003,24 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B1_Q100_H8_E256_L4_P4
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4005
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
- hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 191.775us 760.50% 191.775us 191.775us 1
4007
- hf_kernels_deformable_detr 6.53% 139.932us 99.65% 2.134ms 2.134ms 0.000us 0.00% 26.274us 26.274us 1
4008
- _deformable_detr_57c3d32::ms_deform_attn_forward 3.14% 67.151us 93.12% 1.994ms 664.639us 22.336us 88.58% 26.274us 8.758us 3
4009
- void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.336us 88.58% 22.336us 7.445us 3
4010
- aten::zeros 0.92% 19.641us 87.16% 1.866ms 622.148us 0.000us 0.00% 3.938us 1.313us 3
4011
- aten::zero_ 0.66% 14.050us 84.59% 1.811ms 603.774us 0.000us 0.00% 3.938us 1.313us 3
4012
- aten::fill_ 1.47% 31.401us 83.93% 1.797ms 599.090us 2.881us 11.42% 3.938us 1.313us 3
4013
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.881us 11.42% 2.881us 0.960us 3
4014
- Activity Buffer Request 80.47% 1.723ms 80.47% 1.723ms 1.723ms 1.057us 4.19% 1.057us 1.057us 1
4015
- aten::empty 1.66% 35.481us 1.66% 35.481us 11.827us 0.000us 0.00% 0.000us 0.000us 3
4016
- cudaLaunchKernel 2.78% 59.511us 2.78% 59.511us 9.919us 0.000us 0.00% 0.000us 0.000us 6
4017
- aten::view 0.84% 17.931us 0.84% 17.931us 2.989us 0.000us 0.00% 0.000us 0.000us 6
4018
- aten::select 1.00% 21.440us 1.20% 25.621us 8.540us 0.000us 0.00% 0.000us 0.000us 3
4019
- aten::as_strided 0.20% 4.181us 0.20% 4.181us 1.394us 0.000us 0.00% 0.000us 0.000us 3
4020
- cudaDeviceSynchronize 0.35% 7.461us 0.35% 7.461us 7.461us 0.000us 0.00% 0.000us 0.000us 1
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
- Self CPU time total: 2.141ms
4023
- Self CUDA time total: 25.217us
4024
 
4025
 
4026
 
@@ -4030,24 +4030,24 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B1_Q300_H8_E256_L4_P4
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4032
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4033
- hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 140.638us 537.96% 140.638us 140.638us 1
4034
- hf_kernels_deformable_detr 3.75% 74.302us 99.73% 1.975ms 1.975ms 0.000us 0.00% 27.071us 27.071us 1
4035
- _deformable_detr_57c3d32::ms_deform_attn_forward 1.66% 32.812us 95.98% 1.901ms 633.661us 23.327us 89.23% 27.071us 9.024us 3
4036
- void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 23.327us 89.23% 23.327us 7.776us 3
4037
- aten::zeros 0.45% 8.890us 92.43% 1.831ms 610.224us 0.000us 0.00% 3.744us 1.248us 3
4038
- aten::zero_ 0.40% 7.970us 91.07% 1.804ms 601.294us 0.000us 0.00% 3.744us 1.248us 3
4039
- aten::fill_ 1.26% 24.969us 90.67% 1.796ms 598.637us 2.816us 10.77% 3.744us 1.248us 3
4040
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.816us 10.77% 2.816us 0.939us 3
4041
- Activity Buffer Request 88.11% 1.745ms 88.11% 1.745ms 1.745ms 0.928us 3.55% 0.928us 0.928us 1
4042
- aten::empty 0.90% 17.900us 0.90% 17.900us 5.967us 0.000us 0.00% 0.000us 0.000us 3
4043
- cudaLaunchKernel 2.05% 40.542us 2.05% 40.542us 6.757us 0.000us 0.00% 0.000us 0.000us 6
4044
- aten::view 0.46% 9.070us 0.46% 9.070us 1.512us 0.000us 0.00% 0.000us 0.000us 6
4045
- aten::select 0.58% 11.410us 0.69% 13.720us 4.573us 0.000us 0.00% 0.000us 0.000us 3
4046
- aten::as_strided 0.12% 2.310us 0.12% 2.310us 0.770us 0.000us 0.00% 0.000us 0.000us 3
4047
- cudaDeviceSynchronize 0.27% 5.400us 0.27% 5.400us 5.400us 0.000us 0.00% 0.000us 0.000us 1
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
- Self CPU time total: 1.981ms
4050
- Self CUDA time total: 26.143us
4051
 
4052
 
4053
 
@@ -4057,24 +4057,24 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B2_Q100_H8_E256_L4_P4
4057
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4058
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4059
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4060
- hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 139.424us 546.70% 139.424us 139.424us 1
4061
- hf_kernels_deformable_detr 3.45% 67.322us 99.73% 1.947ms 1.947ms 0.000us 0.00% 26.463us 26.463us 1
4062
- _deformable_detr_57c3d32::ms_deform_attn_forward 1.76% 34.371us 96.28% 1.880ms 626.621us 22.688us 88.96% 26.463us 8.821us 3
4063
- void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.688us 88.96% 22.688us 7.563us 3
4064
- aten::zeros 0.42% 8.159us 92.58% 1.808ms 602.514us 0.000us 0.00% 3.775us 1.258us 3
4065
- aten::zero_ 0.40% 7.880us 91.30% 1.783ms 594.177us 0.000us 0.00% 3.775us 1.258us 3
4066
- aten::fill_ 1.36% 26.500us 90.89% 1.775ms 591.551us 2.815us 11.04% 3.775us 1.258us 3
4067
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.815us 11.04% 2.815us 0.938us 3
4068
- Activity Buffer Request 88.22% 1.722ms 88.22% 1.722ms 1.722ms 0.960us 3.76% 0.960us 0.960us 1
4069
- aten::empty 0.86% 16.851us 0.86% 16.851us 5.617us 0.000us 0.00% 0.000us 0.000us 3
4070
- cudaLaunchKernel 2.08% 40.632us 2.08% 40.632us 6.772us 0.000us 0.00% 0.000us 0.000us 6
4071
- aten::view 0.52% 10.080us 0.52% 10.080us 1.680us 0.000us 0.00% 0.000us 0.000us 6
4072
- aten::select 0.55% 10.661us 0.66% 12.960us 4.320us 0.000us 0.00% 0.000us 0.000us 3
4073
- aten::as_strided 0.12% 2.299us 0.12% 2.299us 0.766us 0.000us 0.00% 0.000us 0.000us 3
4074
- cudaDeviceSynchronize 0.27% 5.270us 0.27% 5.270us 5.270us 0.000us 0.00% 0.000us 0.000us 1
4075
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
- Self CPU time total: 1.952ms
4077
- Self CUDA time total: 25.503us
4078
 
4079
 
4080
 
@@ -4084,42 +4084,42 @@ PROFILE TRACE: hf_kernels_deformable_detr | cuda_B2_Q300_H8_E256_L4_P4
4084
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4085
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4086
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4087
- hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 144.383us 310.53% 144.383us 144.383us 1
4088
- hf_kernels_deformable_detr 3.20% 70.383us 99.77% 2.197ms 2.197ms 0.000us 0.00% 47.520us 47.520us 1
4089
- _deformable_detr_57c3d32::ms_deform_attn_forward 1.51% 33.251us 96.57% 2.127ms 709.009us 43.392us 93.32% 47.520us 15.840us 3
4090
- void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 43.392us 93.32% 43.392us 14.464us 3
4091
- aten::zeros 0.36% 7.853us 93.39% 2.057ms 685.609us 0.000us 0.00% 4.128us 1.376us 3
4092
- aten::zero_ 0.36% 8.030us 92.24% 2.032ms 677.202us 0.000us 0.00% 4.128us 1.376us 3
4093
- aten::fill_ 1.13% 24.791us 91.88% 2.024ms 674.525us 3.104us 6.68% 4.128us 1.376us 3
4094
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.104us 6.68% 3.104us 1.035us 3
4095
- Activity Buffer Request 79.51% 1.751ms 79.51% 1.751ms 1.751ms 1.024us 2.20% 1.024us 1.024us 1
4096
- aten::empty 0.79% 17.369us 0.79% 17.369us 5.790us 0.000us 0.00% 0.000us 0.000us 3
4097
- cudaLaunchKernel 11.88% 261.685us 11.88% 261.685us 43.614us 0.000us 0.00% 0.000us 0.000us 6
4098
- aten::view 0.43% 9.529us 0.43% 9.529us 1.588us 0.000us 0.00% 0.000us 0.000us 6
4099
- aten::select 0.50% 10.960us 0.60% 13.220us 4.407us 0.000us 0.00% 0.000us 0.000us 3
4100
- aten::as_strided 0.10% 2.260us 0.10% 2.260us 0.753us 0.000us 0.00% 0.000us 0.000us 3
4101
- cudaDeviceSynchronize 0.23% 5.101us 0.23% 5.101us 5.101us 0.000us 0.00% 0.000us 0.000us 1
4102
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4103
- Self CPU time total: 2.203ms
4104
- Self CUDA time total: 46.496us
4105
 
4106
 
4107
  impl wl p50(ms) ok
4108
  hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.04 True
4109
- hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.05 True
4110
- hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.05 True
4111
  hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
4112
  </pre></div>
4113
  <div class="uv-install-logs" id="uv-logs-benchmark">
4114
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4115
  <div class="uv-logs-content" style="display: none;">
4116
- Installed 51 packages in 320ms
4117
  </div>
4118
  </div>
4119
  <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4120
- Fetching 7 files: 14%|█▍ | 1/7 [00:00&lt;00:00, 8.32it/s]
4121
- Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 7.49it/s]
4122
- Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 10.54it/s]</div>
4123
  <div class="cell-artifacts">
4124
  <h4>Artifacts:</h4>
4125
  <a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.25s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 23:02:11 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
 
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
+ | N/A 42C P0 83W / 350W | 0MiB / 46068MiB | 12% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
 
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3940
  </span> |
3941
+ Cell: benchmark | 4.69s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4005
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
+ hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 179.167us 708.76% 179.167us 179.167us 1
4007
+ hf_kernels_deformable_detr 6.05% 126.291us 99.56% 2.078ms 2.078ms 0.000us 0.00% 26.335us 26.335us 1
4008
+ _deformable_detr_57c3d32::ms_deform_attn_forward 2.99% 62.312us 93.50% 1.951ms 650.448us 22.366us 88.48% 26.335us 8.778us 3
4009
+ void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.366us 88.48% 22.366us 7.455us 3
4010
+ aten::zeros 0.88% 18.443us 87.81% 1.832ms 610.824us 0.000us 0.00% 3.969us 1.323us 3
4011
+ aten::zero_ 0.60% 12.470us 85.41% 1.782ms 594.116us 0.000us 0.00% 3.969us 1.323us 3
4012
+ aten::fill_ 1.40% 29.180us 84.81% 1.770ms 589.959us 2.913us 11.52% 3.969us 1.323us 3
4013
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.913us 11.52% 2.913us 0.971us 3
4014
+ Activity Buffer Request 81.41% 1.699ms 81.41% 1.699ms 1.699ms 1.056us 4.18% 1.056us 1.056us 1
4015
+ aten::empty 1.52% 31.680us 1.52% 31.680us 10.560us 0.000us 0.00% 0.000us 0.000us 3
4016
+ cudaLaunchKernel 2.90% 60.481us 2.90% 60.481us 10.080us 0.000us 0.00% 0.000us 0.000us 6
4017
+ aten::view 0.77% 16.170us 0.77% 16.170us 2.695us 0.000us 0.00% 0.000us 0.000us 6
4018
+ aten::select 0.87% 18.140us 1.04% 21.670us 7.223us 0.000us 0.00% 0.000us 0.000us 3
4019
+ aten::as_strided 0.17% 3.530us 0.17% 3.530us 1.177us 0.000us 0.00% 0.000us 0.000us 3
4020
+ cudaDeviceSynchronize 0.44% 9.280us 0.44% 9.280us 9.280us 0.000us 0.00% 0.000us 0.000us 1
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
+ Self CPU time total: 2.087ms
4023
+ Self CUDA time total: 25.279us
4024
 
4025
 
4026
 
 
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4032
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4033
+ hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 134.144us 513.10% 134.144us 134.144us 1
4034
+ hf_kernels_deformable_detr 4.75% 94.541us 99.73% 1.985ms 1.985ms 0.000us 0.00% 27.072us 27.072us 1
4035
+ _deformable_detr_57c3d32::ms_deform_attn_forward 1.59% 31.632us 94.98% 1.890ms 630.031us 23.360us 89.35% 27.072us 9.024us 3
4036
+ void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 23.360us 89.35% 23.360us 7.787us 3
4037
+ aten::zeros 0.38% 7.548us 91.51% 1.821ms 607.010us 0.000us 0.00% 3.712us 1.237us 3
4038
+ aten::zero_ 0.42% 8.279us 90.32% 1.797ms 599.120us 0.000us 0.00% 3.712us 1.237us 3
4039
+ aten::fill_ 1.23% 24.533us 89.90% 1.789ms 596.360us 2.784us 10.65% 3.712us 1.237us 3
4040
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.784us 10.65% 2.784us 0.928us 3
4041
+ Activity Buffer Request 87.33% 1.738ms 87.33% 1.738ms 1.738ms 0.928us 3.55% 0.928us 0.928us 1
4042
+ aten::empty 0.81% 16.122us 0.81% 16.122us 5.374us 0.000us 0.00% 0.000us 0.000us 3
4043
+ cudaLaunchKernel 2.12% 42.110us 2.12% 42.110us 7.018us 0.000us 0.00% 0.000us 0.000us 6
4044
+ aten::view 0.47% 9.440us 0.47% 9.440us 1.573us 0.000us 0.00% 0.000us 0.000us 6
4045
+ aten::select 0.52% 10.420us 0.63% 12.530us 4.177us 0.000us 0.00% 0.000us 0.000us 3
4046
+ aten::as_strided 0.11% 2.110us 0.11% 2.110us 0.703us 0.000us 0.00% 0.000us 0.000us 3
4047
+ cudaDeviceSynchronize 0.27% 5.390us 0.27% 5.390us 5.390us 0.000us 0.00% 0.000us 0.000us 1
4048
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4049
+ Self CPU time total: 1.990ms
4050
+ Self CUDA time total: 26.144us
4051
 
4052
 
4053
 
 
4057
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4058
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4059
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4060
+ hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 132.927us 521.86% 132.927us 132.927us 1
4061
+ hf_kernels_deformable_detr 4.64% 88.002us 99.69% 1.889ms 1.889ms 0.000us 0.00% 26.432us 26.432us 1
4062
+ _deformable_detr_57c3d32::ms_deform_attn_forward 1.65% 31.271us 95.05% 1.801ms 600.270us 22.624us 88.82% 26.432us 8.811us 3
4063
+ void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 22.624us 88.82% 22.624us 7.541us 3
4064
+ aten::zeros 0.45% 8.600us 91.43% 1.732ms 577.433us 0.000us 0.00% 3.808us 1.269us 3
4065
+ aten::zero_ 0.42% 7.879us 90.13% 1.708ms 569.182us 0.000us 0.00% 3.808us 1.269us 3
4066
+ aten::fill_ 1.34% 25.390us 89.71% 1.700ms 566.556us 2.848us 11.18% 3.808us 1.269us 3
4067
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.848us 11.18% 2.848us 0.949us 3
4068
+ Activity Buffer Request 87.00% 1.648ms 87.00% 1.648ms 1.648ms 0.960us 3.77% 0.960us 0.960us 1
4069
+ aten::empty 0.85% 16.152us 0.85% 16.152us 5.384us 0.000us 0.00% 0.000us 0.000us 3
4070
+ cudaLaunchKernel 2.16% 40.982us 2.16% 40.982us 6.830us 0.000us 0.00% 0.000us 0.000us 6
4071
+ aten::view 0.49% 9.259us 0.49% 9.259us 1.543us 0.000us 0.00% 0.000us 0.000us 6
4072
+ aten::select 0.57% 10.851us 0.68% 12.901us 4.300us 0.000us 0.00% 0.000us 0.000us 3
4073
+ aten::as_strided 0.11% 2.050us 0.11% 2.050us 0.683us 0.000us 0.00% 0.000us 0.000us 3
4074
+ cudaDeviceSynchronize 0.31% 5.790us 0.31% 5.790us 5.790us 0.000us 0.00% 0.000us 0.000us 1
4075
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
+ Self CPU time total: 1.895ms
4077
+ Self CUDA time total: 25.472us
4078
 
4079
 
4080
 
 
4084
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4085
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4086
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4087
+ hf_kernels_deformable_detr 0.00% 0.000us 0.00% 0.000us 0.000us 141.952us 303.01% 141.952us 141.952us 1
4088
+ hf_kernels_deformable_detr 4.29% 94.562us 99.77% 2.200ms 2.200ms 0.000us 0.00% 47.871us 47.871us 1
4089
+ _deformable_detr_57c3d32::ms_deform_attn_forward 1.45% 32.013us 95.49% 2.106ms 701.872us 43.744us 93.38% 47.871us 15.957us 3
4090
+ void ms_deformable_im2col_gpu_kernel&lt;float&gt;(int, flo... 0.00% 0.000us 0.00% 0.000us 0.000us 43.744us 93.38% 43.744us 14.581us 3
4091
+ aten::zeros 0.35% 7.690us 92.40% 2.038ms 679.194us 0.000us 0.00% 4.127us 1.376us 3
4092
+ aten::zero_ 0.37% 8.230us 91.34% 2.014ms 671.361us 0.000us 0.00% 4.127us 1.376us 3
4093
+ aten::fill_ 1.11% 24.520us 90.96% 2.006ms 668.618us 3.103us 6.62% 4.127us 1.376us 3
4094
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.103us 6.62% 3.103us 1.034us 3
4095
+ Activity Buffer Request 79.72% 1.758ms 79.72% 1.758ms 1.758ms 1.024us 2.19% 1.024us 1.024us 1
4096
+ aten::empty 0.72% 15.810us 0.72% 15.810us 5.270us 0.000us 0.00% 0.000us 0.000us 3
4097
+ cudaLaunchKernel 10.76% 237.325us 10.76% 237.325us 39.554us 0.000us 0.00% 0.000us 0.000us 6
4098
+ aten::view 0.42% 9.159us 0.42% 9.159us 1.527us 0.000us 0.00% 0.000us 0.000us 6
4099
+ aten::select 0.49% 10.790us 0.58% 12.870us 4.290us 0.000us 0.00% 0.000us 0.000us 3
4100
+ aten::as_strided 0.09% 2.080us 0.09% 2.080us 0.693us 0.000us 0.00% 0.000us 0.000us 3
4101
+ cudaDeviceSynchronize 0.23% 4.980us 0.23% 4.980us 4.980us 0.000us 0.00% 0.000us 0.000us 1
4102
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4103
+ Self CPU time total: 2.205ms
4104
+ Self CUDA time total: 46.847us
4105
 
4106
 
4107
  impl wl p50(ms) ok
4108
  hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.04 True
4109
+ hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.04 True
4110
+ hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.04 True
4111
  hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
4112
  </pre></div>
4113
  <div class="uv-install-logs" id="uv-logs-benchmark">
4114
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4115
  <div class="uv-logs-content" style="display: none;">
4116
+ Installed 14 packages in 12ms
4117
  </div>
4118
  </div>
4119
  <div class="cell-stderr">Fetching 7 files: 0%| | 0/7 [00:00&lt;?, ?it/s]
4120
+ Fetching 7 files: 29%|██▊ | 2/7 [00:00&lt;00:00, 16.18it/s]
4121
+ Fetching 7 files: 71%|███████▏ | 5/7 [00:00&lt;00:00, 9.41it/s]
4122
+ Fetching 7 files: 100%|██████████| 7/7 [00:00&lt;00:00, 13.86it/s]</div>
4123
  <div class="cell-artifacts">
4124
  <h4>Artifacts:</h4>
4125
  <a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
deformable_detr/impls/torch_deformable_detr.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.28s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3904,7 +3904,7 @@ Cell: nv | 0.28s
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
- <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 19:41:27 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
@@ -3913,7 +3913,7 @@ Cell: nv | 0.28s
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
- | N/A 32C P0 120W / 350W | 0MiB / 46068MiB | 92% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
@@ -3935,9 +3935,9 @@ Cell: nv | 0.28s
3935
  <span class="collapse-indicators">
3936
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
- <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3939
  </span> |
3940
- Cell: benchmark | 5.49s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -4077,29 +4077,29 @@ PROFILE TRACE: torch_eager | cuda_B1_Q100_H8_E256_L4_P4
4077
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4078
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 20.954ms 1412.31% 20.954ms 20.954ms 1
4081
- torch_eager 20.81% 4.774ms 99.96% 22.930ms 22.930ms 0.000us 0.00% 1.485ms 1.485ms 1
4082
- aten::index 4.58% 1.051ms 16.46% 3.775ms 78.637us 236.928us 15.97% 370.530us 7.719us 48
4083
- aten::copy_ 4.73% 1.085ms 11.22% 2.575ms 11.756us 365.953us 24.67% 365.953us 1.671us 219
4084
- aten::mul 5.80% 1.330ms 10.04% 2.304ms 12.001us 294.214us 19.83% 294.214us 1.532us 192
4085
- void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 236.928us 15.97% 236.928us 4.936us 48
4086
- aten::to 0.58% 133.877us 11.21% 2.571ms 15.036us 0.000us 0.00% 232.351us 1.359us 171
4087
- aten::_to_copy 2.31% 530.135us 10.63% 2.437ms 19.815us 0.000us 0.00% 232.351us 1.889us 123
4088
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 202.211us 13.63% 202.211us 1.685us 120
4089
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 167.998us 11.32% 167.998us 2.000us 84
4090
- aten::contiguous 0.35% 80.702us 8.37% 1.919ms 19.991us 0.000us 0.00% 133.602us 1.392us 96
4091
- aten::clone 0.72% 165.584us 8.01% 1.838ms 19.151us 0.000us 0.00% 133.602us 1.392us 96
4092
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.602us 9.00% 133.602us 1.392us 96
4093
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 115.712us 7.80% 115.712us 1.205us 96
4094
- aten::__and__ 0.62% 142.312us 4.46% 1.024ms 12.189us 0.000us 0.00% 99.106us 1.180us 84
4095
- aten::bitwise_and 2.26% 518.769us 3.84% 881.597us 10.495us 99.106us 6.68% 99.106us 1.180us 84
4096
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 99.106us 6.68% 99.106us 1.180us 84
4097
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 86.240us 5.81% 86.240us 1.198us 72
4098
- aten::sub 2.18% 500.017us 3.71% 850.631us 11.814us 79.203us 5.34% 79.203us 1.100us 72
4099
- aten::add 1.61% 368.526us 2.74% 627.393us 10.457us 74.431us 5.02% 74.431us 1.241us 60
4100
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4101
- Self CPU time total: 22.938ms
4102
- Self CUDA time total: 1.484ms
4103
 
4104
 
4105
 
@@ -4109,29 +4109,29 @@ PROFILE TRACE: torch_eager | cuda_B1_Q300_H8_E256_L4_P4
4109
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4110
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4111
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4112
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.509ms 1221.49% 19.509ms 19.509ms 1
4113
- torch_eager 19.85% 4.302ms 99.97% 21.668ms 21.668ms 0.000us 0.00% 1.598ms 1.598ms 1
4114
- aten::index 4.46% 966.583us 16.34% 3.542ms 73.793us 250.148us 15.66% 382.462us 7.968us 48
4115
- aten::copy_ 4.88% 1.058ms 11.66% 2.528ms 11.545us 367.423us 23.01% 367.423us 1.678us 219
4116
- aten::mul 5.89% 1.276ms 10.32% 2.236ms 11.647us 359.260us 22.49% 359.260us 1.871us 192
4117
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 267.420us 16.74% 267.420us 2.228us 120
4118
- void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 250.148us 15.66% 250.148us 5.211us 48
4119
- aten::to 0.54% 118.126us 10.89% 2.361ms 13.808us 0.000us 0.00% 235.109us 1.375us 171
4120
- aten::_to_copy 1.87% 405.252us 10.35% 2.243ms 18.236us 0.000us 0.00% 235.109us 1.911us 123
4121
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 169.767us 10.63% 169.767us 2.021us 84
4122
- aten::contiguous 0.36% 77.869us 8.56% 1.855ms 19.322us 0.000us 0.00% 132.314us 1.378us 96
4123
- aten::clone 0.77% 166.617us 8.20% 1.777ms 18.511us 0.000us 0.00% 132.314us 1.378us 96
4124
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 132.314us 8.28% 132.314us 1.378us 96
4125
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 117.886us 7.38% 117.886us 1.228us 96
4126
- aten::__and__ 0.36% 78.606us 4.33% 937.927us 11.166us 0.000us 0.00% 105.249us 1.253us 84
4127
- aten::bitwise_and 2.36% 512.411us 3.96% 859.321us 10.230us 105.249us 6.59% 105.249us 1.253us 84
4128
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 105.249us 6.59% 105.249us 1.253us 84
4129
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.480us 6.54% 104.480us 1.451us 72
4130
- aten::add 1.62% 350.142us 2.81% 608.190us 10.136us 91.837us 5.75% 91.837us 1.531us 60
4131
- aten::sub 2.30% 498.767us 3.88% 840.992us 11.680us 80.480us 5.04% 80.480us 1.118us 72
4132
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4133
- Self CPU time total: 21.675ms
4134
- Self CUDA time total: 1.597ms
4135
 
4136
 
4137
 
@@ -4141,29 +4141,29 @@ PROFILE TRACE: torch_eager | cuda_B2_Q100_H8_E256_L4_P4
4141
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4142
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4143
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4144
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.494ms 1266.42% 19.494ms 19.494ms 1
4145
- torch_eager 20.07% 4.284ms 99.97% 21.345ms 21.345ms 0.000us 0.00% 1.540ms 1.540ms 1
4146
- aten::index 4.57% 976.579us 16.61% 3.546ms 73.876us 243.229us 15.80% 377.664us 7.868us 48
4147
- aten::copy_ 4.96% 1.060ms 11.92% 2.545ms 11.623us 367.712us 23.89% 367.712us 1.679us 219
4148
- aten::mul 6.15% 1.313ms 10.67% 2.278ms 11.865us 325.252us 21.13% 325.252us 1.694us 192
4149
- void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 243.229us 15.80% 243.229us 5.067us 48
4150
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 234.241us 15.22% 234.241us 1.952us 120
4151
- aten::to 0.55% 117.567us 11.05% 2.359ms 13.796us 0.000us 0.00% 233.277us 1.364us 171
4152
- aten::_to_copy 1.93% 412.957us 10.50% 2.242ms 18.225us 0.000us 0.00% 233.277us 1.897us 123
4153
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 168.702us 10.96% 168.702us 2.008us 84
4154
- aten::contiguous 0.37% 78.560us 8.76% 1.871ms 19.493us 0.000us 0.00% 134.435us 1.400us 96
4155
- aten::clone 0.72% 153.204us 8.40% 1.793ms 18.675us 0.000us 0.00% 134.435us 1.400us 96
4156
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 134.435us 8.73% 134.435us 1.400us 96
4157
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 115.962us 7.53% 115.962us 1.208us 96
4158
- aten::__and__ 0.35% 74.950us 4.35% 927.999us 11.048us 0.000us 0.00% 104.006us 1.238us 84
4159
- aten::bitwise_and 2.36% 503.597us 4.00% 853.049us 10.155us 104.006us 6.76% 104.006us 1.238us 84
4160
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.006us 6.76% 104.006us 1.238us 84
4161
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 95.808us 6.22% 95.808us 1.331us 72
4162
- aten::add 1.68% 357.766us 2.90% 618.339us 10.306us 83.778us 5.44% 83.778us 1.396us 60
4163
- aten::sub 2.21% 472.075us 3.83% 818.182us 11.364us 78.946us 5.13% 78.946us 1.096us 72
4164
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4165
- Self CPU time total: 21.351ms
4166
- Self CUDA time total: 1.539ms
4167
 
4168
 
4169
 
@@ -4173,37 +4173,43 @@ PROFILE TRACE: torch_eager | cuda_B2_Q300_H8_E256_L4_P4
4173
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4174
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4175
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4176
- torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 21.106ms 1191.42% 21.106ms 21.106ms 1
4177
- torch_eager 20.48% 4.473ms 99.97% 21.833ms 21.833ms 0.000us 0.00% 1.773ms 1.773ms 1
4178
- aten::mul 6.38% 1.394ms 11.03% 2.409ms 12.546us 451.910us 25.51% 451.910us 2.354us 192
4179
- aten::index 4.81% 1.050ms 17.73% 3.872ms 80.660us 281.474us 15.89% 419.235us 8.734us 48
4180
- aten::copy_ 5.13% 1.119ms 12.00% 2.622ms 11.970us 371.967us 21.00% 371.967us 1.698us 219
4181
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 357.220us 20.17% 357.220us 2.977us 120
4182
- void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 281.474us 15.89% 281.474us 5.864us 48
4183
- aten::to 0.62% 134.727us 11.66% 2.546ms 14.889us 0.000us 0.00% 234.206us 1.370us 171
4184
- aten::_to_copy 2.10% 458.958us 11.04% 2.411ms 19.605us 0.000us 0.00% 234.206us 1.904us 123
4185
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 168.509us 9.51% 168.509us 2.006us 84
4186
- aten::contiguous 0.48% 104.345us 9.14% 1.996ms 20.797us 0.000us 0.00% 137.761us 1.435us 96
4187
- aten::clone 0.85% 185.548us 8.66% 1.892ms 19.710us 0.000us 0.00% 137.761us 1.435us 96
4188
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 137.761us 7.78% 137.761us 1.435us 96
4189
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 129.798us 7.33% 129.798us 1.803us 72
4190
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 120.063us 6.78% 120.063us 1.251us 96
4191
- aten::add 1.83% 400.048us 3.06% 668.907us 11.148us 114.148us 6.44% 114.148us 1.902us 60
4192
- aten::__and__ 0.43% 94.485us 4.77% 1.041ms 12.390us 0.000us 0.00% 108.862us 1.296us 84
4193
- aten::bitwise_and 2.65% 579.339us 4.33% 946.258us 11.265us 108.862us 6.15% 108.862us 1.296us 84
4194
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 108.862us 6.15% 108.862us 1.296us 84
4195
- aten::sub 2.45% 535.892us 4.10% 895.598us 12.439us 84.572us 4.77% 84.572us 1.175us 72
4196
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4197
- Self CPU time total: 21.838ms
4198
- Self CUDA time total: 1.771ms
4199
 
4200
 
4201
  impl wl p50(ms) ok
4202
- torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.46 True
4203
- torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.24 True
4204
- torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.25 True
4205
- torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.34 True
4206
  </pre></div>
 
 
 
 
 
 
4207
  <div class="cell-artifacts">
4208
  <h4>Artifacts:</h4>
4209
  <a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.25s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 23:02:11 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
 
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
+ | N/A 42C P0 83W / 350W | 0MiB / 46068MiB | 12% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
 
3935
  <span class="collapse-indicators">
3936
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
+ <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3939
  </span> |
3940
+ Cell: benchmark | 9.26s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
4077
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4078
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4079
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4080
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.976ms 1348.57% 19.976ms 19.976ms 1
4081
+ torch_eager 20.04% 4.395ms 99.96% 21.929ms 21.929ms 0.000us 0.00% 1.482ms 1.482ms 1
4082
+ aten::index 4.53% 992.766us 16.58% 3.638ms 75.786us 236.544us 15.97% 370.336us 7.715us 48
4083
+ aten::copy_ 4.69% 1.028ms 11.56% 2.535ms 11.576us 366.053us 24.71% 366.053us 1.671us 219
4084
+ aten::mul 5.90% 1.295ms 10.04% 2.203ms 11.474us 293.531us 19.82% 293.531us 1.529us 192
4085
+ void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 236.544us 15.97% 236.544us 4.928us 48
4086
+ aten::to 0.58% 126.843us 11.27% 2.473ms 14.461us 0.000us 0.00% 232.261us 1.358us 171
4087
+ aten::_to_copy 1.95% 426.950us 10.69% 2.346ms 19.073us 0.000us 0.00% 232.261us 1.888us 123
4088
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 201.821us 13.62% 201.821us 1.682us 120
4089
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 167.778us 11.33% 167.778us 1.997us 84
4090
+ aten::contiguous 0.36% 78.966us 8.52% 1.869ms 19.471us 0.000us 0.00% 133.792us 1.394us 96
4091
+ aten::clone 0.74% 161.750us 8.16% 1.790ms 18.648us 0.000us 0.00% 133.792us 1.394us 96
4092
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.792us 9.03% 133.792us 1.394us 96
4093
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 115.553us 7.80% 115.553us 1.204us 96
4094
+ aten::__and__ 0.42% 91.609us 4.49% 984.808us 11.724us 0.000us 0.00% 99.041us 1.179us 84
4095
+ aten::bitwise_and 2.54% 557.575us 4.07% 893.199us 10.633us 99.041us 6.69% 99.041us 1.179us 84
4096
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 99.041us 6.69% 99.041us 1.179us 84
4097
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 86.140us 5.82% 86.140us 1.196us 72
4098
+ aten::sub 2.17% 475.165us 3.61% 791.992us 11.000us 79.197us 5.35% 79.197us 1.100us 72
4099
+ aten::add 1.62% 354.490us 2.70% 592.103us 9.868us 74.334us 5.02% 74.334us 1.239us 60
4100
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4101
+ Self CPU time total: 21.937ms
4102
+ Self CUDA time total: 1.481ms
4103
 
4104
 
4105
 
 
4109
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4110
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4111
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4112
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.069ms 1196.67% 19.069ms 19.069ms 1
4113
+ torch_eager 19.87% 4.152ms 99.97% 20.886ms 20.886ms 0.000us 0.00% 1.594ms 1.594ms 1
4114
+ aten::index 4.48% 935.232us 16.67% 3.483ms 72.569us 249.668us 15.67% 382.147us 7.961us 48
4115
+ aten::copy_ 4.80% 1.003ms 11.85% 2.477ms 11.308us 366.556us 23.00% 366.556us 1.674us 219
4116
+ aten::mul 6.04% 1.262ms 10.39% 2.170ms 11.304us 358.714us 22.51% 358.714us 1.868us 192
4117
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 267.167us 16.77% 267.167us 2.226us 120
4118
+ void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 249.668us 15.67% 249.668us 5.201us 48
4119
+ aten::to 0.60% 125.408us 11.23% 2.347ms 13.724us 0.000us 0.00% 234.077us 1.369us 171
4120
+ aten::_to_copy 1.87% 389.897us 10.63% 2.221ms 18.060us 0.000us 0.00% 234.077us 1.903us 123
4121
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 169.728us 10.65% 169.728us 2.021us 84
4122
+ aten::contiguous 0.35% 74.120us 8.81% 1.840ms 19.167us 0.000us 0.00% 132.479us 1.380us 96
4123
+ aten::clone 0.79% 164.425us 8.45% 1.766ms 18.395us 0.000us 0.00% 132.479us 1.380us 96
4124
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 132.479us 8.31% 132.479us 1.380us 96
4125
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 117.475us 7.37% 117.475us 1.224us 96
4126
+ aten::__and__ 0.44% 90.959us 4.50% 941.006us 11.202us 0.000us 0.00% 105.476us 1.256us 84
4127
+ aten::bitwise_and 2.49% 520.216us 4.07% 850.047us 10.120us 105.476us 6.62% 105.476us 1.256us 84
4128
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 105.476us 6.62% 105.476us 1.256us 84
4129
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.197us 6.54% 104.197us 1.447us 72
4130
+ aten::add 1.62% 338.151us 2.73% 570.998us 9.517us 91.678us 5.75% 91.678us 1.528us 60
4131
+ aten::sub 2.14% 447.777us 3.61% 754.447us 10.478us 80.286us 5.04% 80.286us 1.115us 72
4132
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4133
+ Self CPU time total: 20.891ms
4134
+ Self CUDA time total: 1.593ms
4135
 
4136
 
4137
 
 
4141
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4142
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4143
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4144
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.677ms 1279.16% 19.677ms 19.677ms 1
4145
+ torch_eager 19.82% 4.280ms 99.97% 21.590ms 21.590ms 0.000us 0.00% 1.539ms 1.539ms 1
4146
+ aten::index 4.49% 970.701us 16.56% 3.576ms 74.506us 243.261us 15.81% 377.688us 7.868us 48
4147
+ aten::copy_ 4.67% 1.008ms 11.52% 2.487ms 11.356us 367.898us 23.92% 367.898us 1.680us 219
4148
+ aten::mul 5.96% 1.287ms 10.22% 2.207ms 11.495us 324.384us 21.09% 324.384us 1.690us 192
4149
+ void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 243.261us 15.81% 243.261us 5.068us 48
4150
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 233.533us 15.18% 233.533us 1.946us 120
4151
+ aten::to 0.57% 122.968us 11.17% 2.413ms 14.109us 0.000us 0.00% 233.471us 1.365us 171
4152
+ aten::_to_copy 1.93% 415.801us 10.60% 2.290ms 18.615us 0.000us 0.00% 233.471us 1.898us 123
4153
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 169.053us 10.99% 169.053us 2.013us 84
4154
+ aten::contiguous 0.37% 80.833us 8.61% 1.859ms 19.360us 0.000us 0.00% 134.427us 1.400us 96
4155
+ aten::clone 0.74% 159.128us 8.23% 1.778ms 18.518us 0.000us 0.00% 134.427us 1.400us 96
4156
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 134.427us 8.74% 134.427us 1.400us 96
4157
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 115.871us 7.53% 115.871us 1.207us 96
4158
+ aten::__and__ 0.43% 92.507us 4.50% 971.781us 11.569us 0.000us 0.00% 104.160us 1.240us 84
4159
+ aten::bitwise_and 2.49% 538.828us 4.07% 879.274us 10.468us 104.160us 6.77% 104.160us 1.240us 84
4160
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.160us 6.77% 104.160us 1.240us 84
4161
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 95.908us 6.23% 95.908us 1.332us 72
4162
+ aten::add 1.64% 354.089us 2.75% 594.321us 9.905us 83.684us 5.44% 83.684us 1.395us 60
4163
+ aten::sub 2.17% 468.302us 3.66% 789.975us 10.972us 79.297us 5.15% 79.297us 1.101us 72
4164
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4165
+ Self CPU time total: 21.596ms
4166
+ Self CUDA time total: 1.538ms
4167
 
4168
 
4169
 
 
4173
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4174
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4175
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4176
+ torch_eager 0.00% 0.000us 0.00% 0.000us 0.000us 19.412ms 1097.11% 19.412ms 19.412ms 1
4177
+ torch_eager 19.43% 4.188ms 99.97% 21.544ms 21.544ms 0.000us 0.00% 1.770ms 1.770ms 1
4178
+ aten::mul 5.88% 1.267ms 10.26% 2.212ms 11.521us 450.496us 25.46% 450.496us 2.346us 192
4179
+ aten::index 4.35% 938.379us 16.41% 3.536ms 73.661us 281.281us 15.90% 418.917us 8.727us 48
4180
+ aten::copy_ 4.72% 1.017ms 12.00% 2.587ms 11.811us 371.333us 20.99% 371.333us 1.696us 219
4181
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 355.809us 20.11% 355.809us 2.965us 120
4182
+ void at::native::index_elementwise_kernel&lt;128, 4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 281.281us 15.90% 281.281us 5.860us 48
4183
+ aten::to 0.57% 122.376us 11.15% 2.403ms 14.050us 0.000us 0.00% 233.697us 1.367us 171
4184
+ aten::_to_copy 1.79% 386.738us 10.58% 2.280ms 18.538us 0.000us 0.00% 233.697us 1.900us 123
4185
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 167.937us 9.49% 167.937us 1.999us 84
4186
+ aten::contiguous 0.36% 77.297us 8.74% 1.884ms 19.624us 0.000us 0.00% 137.636us 1.434us 96
4187
+ aten::clone 0.72% 155.217us 8.38% 1.807ms 18.819us 0.000us 0.00% 137.636us 1.434us 96
4188
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 137.636us 7.78% 137.636us 1.434us 96
4189
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 130.211us 7.36% 130.211us 1.808us 72
4190
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 119.940us 6.78% 119.940us 1.249us 96
4191
+ aten::add 1.56% 336.953us 2.72% 585.265us 9.754us 114.431us 6.47% 114.431us 1.907us 60
4192
+ aten::__and__ 0.41% 88.309us 4.45% 959.250us 11.420us 0.000us 0.00% 108.994us 1.298us 84
4193
+ aten::bitwise_and 2.40% 517.417us 4.04% 870.941us 10.368us 108.994us 6.16% 108.994us 1.298us 84
4194
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 108.994us 6.16% 108.994us 1.298us 84
4195
+ aten::sub 2.15% 464.219us 3.68% 792.358us 11.005us 84.546us 4.78% 84.546us 1.174us 72
4196
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4197
+ Self CPU time total: 21.550ms
4198
+ Self CUDA time total: 1.769ms
4199
 
4200
 
4201
  impl wl p50(ms) ok
4202
+ torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.38 True
4203
+ torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.08 True
4204
+ torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.16 True
4205
+ torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.17 True
4206
  </pre></div>
4207
+ <div class="uv-install-logs" id="uv-logs-benchmark">
4208
+ <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4209
+ <div class="uv-logs-content" style="display: none;">
4210
+ Installed 37 packages in 280ms
4211
+ </div>
4212
+ </div>
4213
  <div class="cell-artifacts">
4214
  <h4>Artifacts:</h4>
4215
  <a href="artifacts/benchmark/deformable_detr.jsonl" class="artifact" target="_blank">deformable_detr.jsonl</a>
deformable_detr/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 88116c3810103702d4e4bca4659d09621c275dbe5bc24360506bd5c5adb84f9c
  • Pointer size: 130 Bytes
  • Size of remote file: 14.9 kB

Git LFS Details

  • SHA256: 39bf256158907575092097d20bcc588a7fb4ce049cb7b107bfda5e17eb6307c7
  • Pointer size: 130 Bytes
  • Size of remote file: 14.9 kB
deformable_detr/results/combined_results.html CHANGED
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
- <dc:date>2025-12-19T19:55:30.123615</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
@@ -3973,70 +3973,70 @@ body[data-tool="eraser"] .main-content {
3973
  <g id="matplotlib.axis_2">
3974
  <g id="ytick_1">
3975
  <g id="grid-y--2" class="grid grid-y">
3976
- <path d="M 39.870649 410.192454 L 824.19299 410.192454 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3977
  </g>
3978
  <g id="line2d_5">
3979
  <defs>
3980
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3981
  </defs>
3982
  <g>
3983
- <use ns4:href="#m0fca2865ba" x="39.870649" y="410.192454" style="stroke: #000000; stroke-width: 0.8" />
3984
  </g>
3985
  </g>
3986
  <g id="text_5">
3987
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="413.991673" transform="rotate(-0 32.870649 413.991673)">0</text>
3988
  </g>
3989
  </g>
3990
  <g id="ytick_2">
3991
  <g id="grid-y--3" class="grid grid-y">
3992
- <path d="M 39.870649 326.087525 L 824.19299 326.087525 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3993
  </g>
3994
  <g id="line2d_6">
3995
  <g>
3996
- <use ns4:href="#m0fca2865ba" x="39.870649" y="326.087525" style="stroke: #000000; stroke-width: 0.8" />
3997
  </g>
3998
  </g>
3999
  <g id="text_6">
4000
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="329.886744" transform="rotate(-0 32.870649 329.886744)">1</text>
4001
  </g>
4002
  </g>
4003
  <g id="ytick_3">
4004
  <g id="grid-y--4" class="grid grid-y">
4005
- <path d="M 39.870649 241.982596 L 824.19299 241.982596 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4006
  </g>
4007
  <g id="line2d_7">
4008
  <g>
4009
- <use ns4:href="#m0fca2865ba" x="39.870649" y="241.982596" style="stroke: #000000; stroke-width: 0.8" />
4010
  </g>
4011
  </g>
4012
  <g id="text_7">
4013
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="245.781814" transform="rotate(-0 32.870649 245.781814)">2</text>
4014
  </g>
4015
  </g>
4016
  <g id="ytick_4">
4017
  <g id="grid-y--5" class="grid grid-y">
4018
- <path d="M 39.870649 157.877666 L 824.19299 157.877666 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4019
  </g>
4020
  <g id="line2d_8">
4021
  <g>
4022
- <use ns4:href="#m0fca2865ba" x="39.870649" y="157.877666" style="stroke: #000000; stroke-width: 0.8" />
4023
  </g>
4024
  </g>
4025
  <g id="text_8">
4026
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="161.676885" transform="rotate(-0 32.870649 161.676885)">3</text>
4027
  </g>
4028
  </g>
4029
  <g id="ytick_5">
4030
  <g id="grid-y--6" class="grid grid-y">
4031
- <path d="M 39.870649 73.772737 L 824.19299 73.772737 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4032
  </g>
4033
  <g id="line2d_9">
4034
  <g>
4035
- <use ns4:href="#m0fca2865ba" x="39.870649" y="73.772737" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="77.571956" transform="rotate(-0 32.870649 77.571956)">4</text>
4040
  </g>
4041
  </g>
4042
  <g id="label--y" class="ylabel">
@@ -4044,26 +4044,26 @@ body[data-tool="eraser"] .main-content {
4044
  </g>
4045
  </g>
4046
  <g id="series--hf-kernels-deformable-detr" class="series">
4047
- <path d="M 75.521665 407.004793 L 313.195102 406.379052 L 550.868538 406.331113 L 788.541975 406.269716 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4048
  <defs>
4049
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4050
  </defs>
4051
  <g clip-path="url(#pbac879f81a)">
4052
  <use ns4:href="#md7efaf3aec" x="75.521665" y="407.004793" style="fill: #1f77b4; stroke: #1f77b4" />
4053
- <use ns4:href="#md7efaf3aec" x="313.195102" y="406.379052" style="fill: #1f77b4; stroke: #1f77b4" />
4054
- <use ns4:href="#md7efaf3aec" x="550.868538" y="406.331113" style="fill: #1f77b4; stroke: #1f77b4" />
4055
- <use ns4:href="#md7efaf3aec" x="788.541975" y="406.269716" style="fill: #1f77b4; stroke: #1f77b4" />
4056
  </g>
4057
  </g>
4058
  <g id="series--torch-eager" class="series">
4059
- <path d="M 75.521665 119.402268 L 313.195102 53.99992 L 550.868538 52.595284 L 788.541975 44.981181 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4060
  <defs>
4061
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4062
  </defs>
4063
  <g clip-path="url(#pbac879f81a)">
4064
- <use ns4:href="#m9b8c54d372" x="75.521665" y="119.402268" style="fill: #ff7f0e; stroke: #ff7f0e" />
4065
- <use ns4:href="#m9b8c54d372" x="313.195102" y="53.99992" style="fill: #ff7f0e; stroke: #ff7f0e" />
4066
- <use ns4:href="#m9b8c54d372" x="550.868538" y="52.595284" style="fill: #ff7f0e; stroke: #ff7f0e" />
4067
  <use ns4:href="#m9b8c54d372" x="788.541975" y="44.981181" style="fill: #ff7f0e; stroke: #ff7f0e" />
4068
  </g>
4069
  </g>
@@ -4122,7 +4122,7 @@ body[data-tool="eraser"] .main-content {
4122
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4123
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4124
  </span> |
4125
- Cell: combine | 4.63s
4126
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4127
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4128
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4210,13 +4210,13 @@ COMBINED BENCHMARK SUMMARY
4210
 
4211
  impl wl p50(ms) ok
4212
  hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.04 True
4213
- hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.05 True
4214
- hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.05 True
4215
  hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
4216
- torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.46 True
4217
- torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.24 True
4218
- torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.25 True
4219
- torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.34 True
4220
 
4221
  GENERATING COMBINED VISUALIZATION
4222
 
@@ -4236,7 +4236,7 @@ Implementations included:
4236
  <div class="uv-install-logs" id="uv-logs-combine">
4237
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4238
  <div class="uv-logs-content" style="display: none;">
4239
- Installed 37 packages in 311ms
4240
  </div>
4241
  </div>
4242
  <div class="cell-artifacts">
@@ -4249,7 +4249,7 @@ Installed 37 packages in 311ms
4249
  <rdf:RDF>
4250
  <ns2:Work>
4251
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4252
- <dc:date>2025-12-19T19:55:30.123615</dc:date>
4253
  <dc:format>image/svg+xml</dc:format>
4254
  <dc:creator>
4255
  <ns2:Agent>
@@ -4333,70 +4333,70 @@ Installed 37 packages in 311ms
4333
  <g id="matplotlib.axis_2">
4334
  <g id="ytick_1">
4335
  <g id="grid-y--2" class="grid grid-y">
4336
- <path d="M 39.870649 410.192454 L 824.19299 410.192454 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4337
  </g>
4338
  <g id="line2d_5">
4339
  <defs>
4340
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4341
  </defs>
4342
  <g>
4343
- <use ns4:href="#m0fca2865ba" x="39.870649" y="410.192454" style="stroke: #000000; stroke-width: 0.8" />
4344
  </g>
4345
  </g>
4346
  <g id="text_5">
4347
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="413.991673" transform="rotate(-0 32.870649 413.991673)">0</text>
4348
  </g>
4349
  </g>
4350
  <g id="ytick_2">
4351
  <g id="grid-y--3" class="grid grid-y">
4352
- <path d="M 39.870649 326.087525 L 824.19299 326.087525 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4353
  </g>
4354
  <g id="line2d_6">
4355
  <g>
4356
- <use ns4:href="#m0fca2865ba" x="39.870649" y="326.087525" style="stroke: #000000; stroke-width: 0.8" />
4357
  </g>
4358
  </g>
4359
  <g id="text_6">
4360
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="329.886744" transform="rotate(-0 32.870649 329.886744)">1</text>
4361
  </g>
4362
  </g>
4363
  <g id="ytick_3">
4364
  <g id="grid-y--4" class="grid grid-y">
4365
- <path d="M 39.870649 241.982596 L 824.19299 241.982596 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4366
  </g>
4367
  <g id="line2d_7">
4368
  <g>
4369
- <use ns4:href="#m0fca2865ba" x="39.870649" y="241.982596" style="stroke: #000000; stroke-width: 0.8" />
4370
  </g>
4371
  </g>
4372
  <g id="text_7">
4373
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="245.781814" transform="rotate(-0 32.870649 245.781814)">2</text>
4374
  </g>
4375
  </g>
4376
  <g id="ytick_4">
4377
  <g id="grid-y--5" class="grid grid-y">
4378
- <path d="M 39.870649 157.877666 L 824.19299 157.877666 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4379
  </g>
4380
  <g id="line2d_8">
4381
  <g>
4382
- <use ns4:href="#m0fca2865ba" x="39.870649" y="157.877666" style="stroke: #000000; stroke-width: 0.8" />
4383
  </g>
4384
  </g>
4385
  <g id="text_8">
4386
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="161.676885" transform="rotate(-0 32.870649 161.676885)">3</text>
4387
  </g>
4388
  </g>
4389
  <g id="ytick_5">
4390
  <g id="grid-y--6" class="grid grid-y">
4391
- <path d="M 39.870649 73.772737 L 824.19299 73.772737 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4392
  </g>
4393
  <g id="line2d_9">
4394
  <g>
4395
- <use ns4:href="#m0fca2865ba" x="39.870649" y="73.772737" style="stroke: #000000; stroke-width: 0.8" />
4396
  </g>
4397
  </g>
4398
  <g id="text_9">
4399
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="77.571956" transform="rotate(-0 32.870649 77.571956)">4</text>
4400
  </g>
4401
  </g>
4402
  <g id="label--y" class="ylabel">
@@ -4404,26 +4404,26 @@ Installed 37 packages in 311ms
4404
  </g>
4405
  </g>
4406
  <g id="series--hf-kernels-deformable-detr" class="series">
4407
- <path d="M 75.521665 407.004793 L 313.195102 406.379052 L 550.868538 406.331113 L 788.541975 406.269716 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4408
  <defs>
4409
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4410
  </defs>
4411
  <g clip-path="url(#pbac879f81a)">
4412
  <use ns4:href="#md7efaf3aec" x="75.521665" y="407.004793" style="fill: #1f77b4; stroke: #1f77b4" />
4413
- <use ns4:href="#md7efaf3aec" x="313.195102" y="406.379052" style="fill: #1f77b4; stroke: #1f77b4" />
4414
- <use ns4:href="#md7efaf3aec" x="550.868538" y="406.331113" style="fill: #1f77b4; stroke: #1f77b4" />
4415
- <use ns4:href="#md7efaf3aec" x="788.541975" y="406.269716" style="fill: #1f77b4; stroke: #1f77b4" />
4416
  </g>
4417
  </g>
4418
  <g id="series--torch-eager" class="series">
4419
- <path d="M 75.521665 119.402268 L 313.195102 53.99992 L 550.868538 52.595284 L 788.541975 44.981181 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4420
  <defs>
4421
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4422
  </defs>
4423
  <g clip-path="url(#pbac879f81a)">
4424
- <use ns4:href="#m9b8c54d372" x="75.521665" y="119.402268" style="fill: #ff7f0e; stroke: #ff7f0e" />
4425
- <use ns4:href="#m9b8c54d372" x="313.195102" y="53.99992" style="fill: #ff7f0e; stroke: #ff7f0e" />
4426
- <use ns4:href="#m9b8c54d372" x="550.868538" y="52.595284" style="fill: #ff7f0e; stroke: #ff7f0e" />
4427
  <use ns4:href="#m9b8c54d372" x="788.541975" y="44.981181" style="fill: #ff7f0e; stroke: #ff7f0e" />
4428
  </g>
4429
  </g>
 
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
+ <dc:date>2025-12-19T23:02:54.345828</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
 
3973
  <g id="matplotlib.axis_2">
3974
  <g id="ytick_1">
3975
  <g id="grid-y--2" class="grid grid-y">
3976
+ <path d="M 39.870649 410.286782 L 824.19299 410.286782 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3977
  </g>
3978
  <g id="line2d_5">
3979
  <defs>
3980
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3981
  </defs>
3982
  <g>
3983
+ <use ns4:href="#m0fca2865ba" x="39.870649" y="410.286782" style="stroke: #000000; stroke-width: 0.8" />
3984
  </g>
3985
  </g>
3986
  <g id="text_5">
3987
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="414.086" transform="rotate(-0 32.870649 414.086)">0</text>
3988
  </g>
3989
  </g>
3990
  <g id="ytick_2">
3991
  <g id="grid-y--3" class="grid grid-y">
3992
+ <path d="M 39.870649 322.746079 L 824.19299 322.746079 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3993
  </g>
3994
  <g id="line2d_6">
3995
  <g>
3996
+ <use ns4:href="#m0fca2865ba" x="39.870649" y="322.746079" style="stroke: #000000; stroke-width: 0.8" />
3997
  </g>
3998
  </g>
3999
  <g id="text_6">
4000
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="326.545298" transform="rotate(-0 32.870649 326.545298)">1</text>
4001
  </g>
4002
  </g>
4003
  <g id="ytick_3">
4004
  <g id="grid-y--4" class="grid grid-y">
4005
+ <path d="M 39.870649 235.205376 L 824.19299 235.205376 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4006
  </g>
4007
  <g id="line2d_7">
4008
  <g>
4009
+ <use ns4:href="#m0fca2865ba" x="39.870649" y="235.205376" style="stroke: #000000; stroke-width: 0.8" />
4010
  </g>
4011
  </g>
4012
  <g id="text_7">
4013
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="239.004595" transform="rotate(-0 32.870649 239.004595)">2</text>
4014
  </g>
4015
  </g>
4016
  <g id="ytick_4">
4017
  <g id="grid-y--5" class="grid grid-y">
4018
+ <path d="M 39.870649 147.664674 L 824.19299 147.664674 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4019
  </g>
4020
  <g id="line2d_8">
4021
  <g>
4022
+ <use ns4:href="#m0fca2865ba" x="39.870649" y="147.664674" style="stroke: #000000; stroke-width: 0.8" />
4023
  </g>
4024
  </g>
4025
  <g id="text_8">
4026
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="151.463893" transform="rotate(-0 32.870649 151.463893)">3</text>
4027
  </g>
4028
  </g>
4029
  <g id="ytick_5">
4030
  <g id="grid-y--6" class="grid grid-y">
4031
+ <path d="M 39.870649 60.123971 L 824.19299 60.123971 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4032
  </g>
4033
  <g id="line2d_9">
4034
  <g>
4035
+ <use ns4:href="#m0fca2865ba" x="39.870649" y="60.123971" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="63.92319" transform="rotate(-0 32.870649 63.92319)">4</text>
4040
  </g>
4041
  </g>
4042
  <g id="label--y" class="ylabel">
 
4044
  </g>
4045
  </g>
4046
  <g id="series--hf-kernels-deformable-detr" class="series">
4047
+ <path d="M 75.521665 407.004793 L 313.195102 406.442782 L 550.868538 406.446283 L 788.541975 406.246691 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4048
  <defs>
4049
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4050
  </defs>
4051
  <g clip-path="url(#pbac879f81a)">
4052
  <use ns4:href="#md7efaf3aec" x="75.521665" y="407.004793" style="fill: #1f77b4; stroke: #1f77b4" />
4053
+ <use ns4:href="#md7efaf3aec" x="313.195102" y="406.442782" style="fill: #1f77b4; stroke: #1f77b4" />
4054
+ <use ns4:href="#md7efaf3aec" x="550.868538" y="406.446283" style="fill: #1f77b4; stroke: #1f77b4" />
4055
+ <use ns4:href="#md7efaf3aec" x="788.541975" y="406.246691" style="fill: #1f77b4; stroke: #1f77b4" />
4056
  </g>
4057
  </g>
4058
  <g id="series--torch-eager" class="series">
4059
+ <path d="M 75.521665 114.041778 L 313.195102 52.888032 L 550.868538 46.287288 L 788.541975 44.981181 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4060
  <defs>
4061
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4062
  </defs>
4063
  <g clip-path="url(#pbac879f81a)">
4064
+ <use ns4:href="#m9b8c54d372" x="75.521665" y="114.041778" style="fill: #ff7f0e; stroke: #ff7f0e" />
4065
+ <use ns4:href="#m9b8c54d372" x="313.195102" y="52.888032" style="fill: #ff7f0e; stroke: #ff7f0e" />
4066
+ <use ns4:href="#m9b8c54d372" x="550.868538" y="46.287288" style="fill: #ff7f0e; stroke: #ff7f0e" />
4067
  <use ns4:href="#m9b8c54d372" x="788.541975" y="44.981181" style="fill: #ff7f0e; stroke: #ff7f0e" />
4068
  </g>
4069
  </g>
 
4122
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4123
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4124
  </span> |
4125
+ Cell: combine | 4.41s
4126
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4127
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4128
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4210
 
4211
  impl wl p50(ms) ok
4212
  hf_kernels_deformable_detr cuda_B1_Q100_H8_E256_L4_P4 0.04 True
4213
+ hf_kernels_deformable_detr cuda_B1_Q300_H8_E256_L4_P4 0.04 True
4214
+ hf_kernels_deformable_detr cuda_B2_Q100_H8_E256_L4_P4 0.04 True
4215
  hf_kernels_deformable_detr cuda_B2_Q300_H8_E256_L4_P4 0.05 True
4216
+ torch_eager cuda_B1_Q100_H8_E256_L4_P4 3.38 True
4217
+ torch_eager cuda_B1_Q300_H8_E256_L4_P4 4.08 True
4218
+ torch_eager cuda_B2_Q100_H8_E256_L4_P4 4.16 True
4219
+ torch_eager cuda_B2_Q300_H8_E256_L4_P4 4.17 True
4220
 
4221
  GENERATING COMBINED VISUALIZATION
4222
 
 
4236
  <div class="uv-install-logs" id="uv-logs-combine">
4237
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4238
  <div class="uv-logs-content" style="display: none;">
4239
+ Installed 37 packages in 297ms
4240
  </div>
4241
  </div>
4242
  <div class="cell-artifacts">
 
4249
  <rdf:RDF>
4250
  <ns2:Work>
4251
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4252
+ <dc:date>2025-12-19T23:02:54.345828</dc:date>
4253
  <dc:format>image/svg+xml</dc:format>
4254
  <dc:creator>
4255
  <ns2:Agent>
 
4333
  <g id="matplotlib.axis_2">
4334
  <g id="ytick_1">
4335
  <g id="grid-y--2" class="grid grid-y">
4336
+ <path d="M 39.870649 410.286782 L 824.19299 410.286782 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4337
  </g>
4338
  <g id="line2d_5">
4339
  <defs>
4340
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4341
  </defs>
4342
  <g>
4343
+ <use ns4:href="#m0fca2865ba" x="39.870649" y="410.286782" style="stroke: #000000; stroke-width: 0.8" />
4344
  </g>
4345
  </g>
4346
  <g id="text_5">
4347
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="414.086" transform="rotate(-0 32.870649 414.086)">0</text>
4348
  </g>
4349
  </g>
4350
  <g id="ytick_2">
4351
  <g id="grid-y--3" class="grid grid-y">
4352
+ <path d="M 39.870649 322.746079 L 824.19299 322.746079 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4353
  </g>
4354
  <g id="line2d_6">
4355
  <g>
4356
+ <use ns4:href="#m0fca2865ba" x="39.870649" y="322.746079" style="stroke: #000000; stroke-width: 0.8" />
4357
  </g>
4358
  </g>
4359
  <g id="text_6">
4360
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="326.545298" transform="rotate(-0 32.870649 326.545298)">1</text>
4361
  </g>
4362
  </g>
4363
  <g id="ytick_3">
4364
  <g id="grid-y--4" class="grid grid-y">
4365
+ <path d="M 39.870649 235.205376 L 824.19299 235.205376 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4366
  </g>
4367
  <g id="line2d_7">
4368
  <g>
4369
+ <use ns4:href="#m0fca2865ba" x="39.870649" y="235.205376" style="stroke: #000000; stroke-width: 0.8" />
4370
  </g>
4371
  </g>
4372
  <g id="text_7">
4373
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="239.004595" transform="rotate(-0 32.870649 239.004595)">2</text>
4374
  </g>
4375
  </g>
4376
  <g id="ytick_4">
4377
  <g id="grid-y--5" class="grid grid-y">
4378
+ <path d="M 39.870649 147.664674 L 824.19299 147.664674 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4379
  </g>
4380
  <g id="line2d_8">
4381
  <g>
4382
+ <use ns4:href="#m0fca2865ba" x="39.870649" y="147.664674" style="stroke: #000000; stroke-width: 0.8" />
4383
  </g>
4384
  </g>
4385
  <g id="text_8">
4386
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="151.463893" transform="rotate(-0 32.870649 151.463893)">3</text>
4387
  </g>
4388
  </g>
4389
  <g id="ytick_5">
4390
  <g id="grid-y--6" class="grid grid-y">
4391
+ <path d="M 39.870649 60.123971 L 824.19299 60.123971 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4392
  </g>
4393
  <g id="line2d_9">
4394
  <g>
4395
+ <use ns4:href="#m0fca2865ba" x="39.870649" y="60.123971" style="stroke: #000000; stroke-width: 0.8" />
4396
  </g>
4397
  </g>
4398
  <g id="text_9">
4399
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="32.870649" y="63.92319" transform="rotate(-0 32.870649 63.92319)">4</text>
4400
  </g>
4401
  </g>
4402
  <g id="label--y" class="ylabel">
 
4404
  </g>
4405
  </g>
4406
  <g id="series--hf-kernels-deformable-detr" class="series">
4407
+ <path d="M 75.521665 407.004793 L 313.195102 406.442782 L 550.868538 406.446283 L 788.541975 406.246691 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4408
  <defs>
4409
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4410
  </defs>
4411
  <g clip-path="url(#pbac879f81a)">
4412
  <use ns4:href="#md7efaf3aec" x="75.521665" y="407.004793" style="fill: #1f77b4; stroke: #1f77b4" />
4413
+ <use ns4:href="#md7efaf3aec" x="313.195102" y="406.442782" style="fill: #1f77b4; stroke: #1f77b4" />
4414
+ <use ns4:href="#md7efaf3aec" x="550.868538" y="406.446283" style="fill: #1f77b4; stroke: #1f77b4" />
4415
+ <use ns4:href="#md7efaf3aec" x="788.541975" y="406.246691" style="fill: #1f77b4; stroke: #1f77b4" />
4416
  </g>
4417
  </g>
4418
  <g id="series--torch-eager" class="series">
4419
+ <path d="M 75.521665 114.041778 L 313.195102 52.888032 L 550.868538 46.287288 L 788.541975 44.981181 " clip-path="url(#pbac879f81a)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4420
  <defs>
4421
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4422
  </defs>
4423
  <g clip-path="url(#pbac879f81a)">
4424
+ <use ns4:href="#m9b8c54d372" x="75.521665" y="114.041778" style="fill: #ff7f0e; stroke: #ff7f0e" />
4425
+ <use ns4:href="#m9b8c54d372" x="313.195102" y="52.888032" style="fill: #ff7f0e; stroke: #ff7f0e" />
4426
+ <use ns4:href="#m9b8c54d372" x="550.868538" y="46.287288" style="fill: #ff7f0e; stroke: #ff7f0e" />
4427
  <use ns4:href="#m9b8c54d372" x="788.541975" y="44.981181" style="fill: #ff7f0e; stroke: #ff7f0e" />
4428
  </g>
4429
  </g>
flash_attn/impls/artifacts/benchmark/attention.jsonl CHANGED
@@ -1,6 +1,6 @@
1
- {"ts": "2025-12-19T19:55:13Z", "run": "e453bd1c3c404adca7ebbffbcb1899bf", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.971173999914754, "p50": 0.9783339999103191, "p90": 0.9836439999162394, "mean": 0.9789179998733744, "iqr": 0.007710000090810354, "raw_times": [0.9783339999103191, 0.975933999825429, 0.9836439999162394, 0.9855039998001303, 0.971173999914754], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0032949999185803, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.0003604888916015625, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
- {"ts": "2025-12-19T19:55:13Z", "run": "e453bd1c3c404adca7ebbffbcb1899bf", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0312540000541048, "p50": 1.039535000018077, "p90": 1.0408949999600736, "mean": 1.0369627999807562, "iqr": 0.00922000003811263, "raw_times": [1.031674999921961, 1.0408949999600736, 1.0414549999495648, 1.0312540000541048, 1.039535000018077], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0439259999657224, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
- {"ts": "2025-12-19T19:55:14Z", "run": "e453bd1c3c404adca7ebbffbcb1899bf", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.05486600000404, "p50": 1.0608159998355404, "p90": 1.0660549999101931, "mean": 1.062165799930881, "iqr": 0.010128999974767794, "raw_times": [1.0608159998355404, 1.0731659999692056, 1.0559259999354254, 1.05486600000404, 1.0660549999101931], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.0692559999370133, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
- {"ts": "2025-12-19T19:55:14Z", "run": "e453bd1c3c404adca7ebbffbcb1899bf", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.0765559998162644, "p50": 1.0860869999760325, "p90": 1.0925159999715106, "mean": 1.0862464000183536, "iqr": 0.013049999779468635, "raw_times": [1.0860869999760325, 1.0925159999715106, 1.0765559998162644, 1.079466000192042, 1.0966070001359185], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.10497600007875, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
- {"ts": "2025-12-19T19:55:14Z", "run": "e453bd1c3c404adca7ebbffbcb1899bf", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.247940999974162, "p50": 1.2629510001715971, "p90": 1.2655800001084572, "mean": 1.2603426000623585, "iqr": 0.014840000176263857, "raw_times": [1.2629510001715971, 1.247940999974162, 1.2655800001084572, 1.2507399999321933, 1.2745010001253831], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2752009999985603, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.00035858154296875, "mse": 2.7865171432495117e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
- {"ts": "2025-12-19T19:55:14Z", "run": "e453bd1c3c404adca7ebbffbcb1899bf", "impl": "xformers_meff", "tags": {"family": "xformers", "backend": "memory_efficient", "compile": "none"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.228739999987738, "p50": 1.2448499999209162, "p90": 1.2651710001136962, "mean": 1.2494922000314546, "iqr": 0.028152000140835298, "raw_times": [1.228739999987738, 1.2448499999209162, 1.237018999972861, 1.2716810001620615, 1.2651710001136962], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.262461000123949, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.000362396240234375, "mse": 2.8014183044433594e-06, "ref": "sdpa_math_fp32"}, "err": null}
 
1
+ {"ts": "2025-12-19T23:02:00Z", "run": "d08cdddcbd814f0a98850e99a3cc8f3c", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L128_bfloat16", "batch": 1, "seq_len": 4224, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2105559999326942, "p50": 1.2135660001604265, "p90": 1.214856999922631, "mean": 1.213000200004899, "iqr": 0.0038309999581542797, "raw_times": [1.2149960000442661, 1.2110259999644768, 1.2105559999326942, 1.2135660001604265, 1.214856999922631], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2067360000855842, "peak_bytes": 295567360, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
2
+ {"ts": "2025-12-19T23:02:00Z", "run": "d08cdddcbd814f0a98850e99a3cc8f3c", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L256_bfloat16", "batch": 1, "seq_len": 4352, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2594280001394509, "p50": 1.2752780000937491, "p90": 1.2771070000781037, "mean": 1.2731776000691752, "iqr": 0.010640000027706265, "raw_times": [1.2752780000937491, 1.2664670000503975, 1.2771070000781037, 1.2594280001394509, 1.287607999984175], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2718570001197804, "peak_bytes": 304742400, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
3
+ {"ts": "2025-12-19T23:02:00Z", "run": "d08cdddcbd814f0a98850e99a3cc8f3c", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L320_bfloat16", "batch": 1, "seq_len": 4416, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.2881479999578005, "p50": 1.2985280000066268, "p90": 1.2987470001917245, "mean": 1.2992600000416132, "iqr": 0.008449000233667903, "raw_times": [1.2902979999580566, 1.2881479999578005, 1.2985280000066268, 1.2987470001917245, 1.3205790000938578], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.2919179998789332, "peak_bytes": 307494912, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
4
+ {"ts": "2025-12-19T23:02:00Z", "run": "d08cdddcbd814f0a98850e99a3cc8f3c", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L384_bfloat16", "batch": 1, "seq_len": 4480, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.32487900009437, "p50": 1.3346000000638014, "p90": 1.337429000159318, "mean": 1.3341430000764376, "iqr": 0.006821000170020852, "raw_times": [1.32487900009437, 1.337429000159318, 1.3346000000638014, 1.3306079999892972, 1.3431990000754013], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.327048999883118, "peak_bytes": 311296000, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
5
+ {"ts": "2025-12-19T23:02:00Z", "run": "d08cdddcbd814f0a98850e99a3cc8f3c", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L448_bfloat16", "batch": 1, "seq_len": 4544, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.4795820000017557, "p50": 1.4878020001560799, "p90": 1.4919819998340245, "mean": 1.4892582000356924, "iqr": 0.004879999778495403, "raw_times": [1.4795820000017557, 1.4919819998340245, 1.487102000055529, 1.499823000131073, 1.4878020001560799], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.4706619999742543, "peak_bytes": 315621376, "ok": true, "absmax": 0.0625, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.0625, "mae": 0.000354766845703125, "mse": 2.7418136596679688e-06, "ref": "sdpa_math_fp32"}, "err": null}
6
+ {"ts": "2025-12-19T23:02:00Z", "run": "d08cdddcbd814f0a98850e99a3cc8f3c", "impl": "torch_flash_ma", "tags": {"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"}, "wl": {"name": "cuda_attn_L512_bfloat16", "batch": 1, "seq_len": 4608, "heads": 24, "head_dim": 128, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.507972999888807, "p50": 1.5174029999798222, "p90": 1.518043000032776, "mean": 1.5156109999679757, "iqr": 0.005300000111674308, "raw_times": [1.518043000032776, 1.5174029999798222, 1.5218930000173714, 1.507972999888807, 1.5127429999211017], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.517042999921614, "peak_bytes": 319946752, "ok": true, "absmax": 0.125, "corr": {"ok": true, "rtol": 0.02, "atol": 0.02, "absmax": 0.125, "mae": 0.0003566741943359375, "mse": 2.7567148208618164e-06, "ref": "sdpa_math_fp32"}, "err": null}
flash_attn/impls/cells/benchmark.py CHANGED
@@ -4,7 +4,6 @@
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
7
- # "xformers",
8
  # ]
9
  #
10
  # [tool.uv.sources]
@@ -13,18 +12,18 @@
13
  import torch
14
  import sys
15
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
16
- import xformers.ops as xops
17
 
18
 
19
- def xformers_attention(q, k, v):
20
- """xFormers memory efficient attention"""
21
- # xFormers expects [batch, seq_len, heads, head_dim]
22
- return xops.memory_efficient_attention(q, k, v)
 
23
 
24
 
25
  run_benchmark(
26
  kernel_type=KernelTypeEnum.ATTENTION,
27
- impl_name="xformers_meff",
28
- impl_tags={"family": "xformers", "backend": "memory_efficient", "compile": "none"},
29
- impl_func=xformers_attention,
30
  )
 
4
  # "numpy",
5
  # "torch==2.8.0",
6
  # "kernels-benchmark-tools",
 
7
  # ]
8
  #
9
  # [tool.uv.sources]
 
12
  import torch
13
  import sys
14
  from kernels_benchmark_tools import KernelTypeEnum, run_benchmark
 
15
 
16
 
17
+ def torch_flash(q, k, v):
18
+ qt, kt, vt = (x.transpose(1, 2).contiguous() for x in (q, k, v))
19
+ with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.FLASH_ATTENTION):
20
+ o = torch.nn.functional.scaled_dot_product_attention(qt, kt, vt)
21
+ return o.transpose(1, 2).contiguous()
22
 
23
 
24
  run_benchmark(
25
  kernel_type=KernelTypeEnum.ATTENTION,
26
+ impl_name="torch_flash_ma",
27
+ impl_tags={"family": "torch-sdpa", "backend": "FLASH", "compile": "max-autotune"},
28
+ impl_func=torch_flash,
29
  )
flash_attn/impls/flash_attention.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.25s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3905,7 +3905,7 @@ Cell: nv | 0.25s
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
- <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 19:41:23 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
@@ -3914,7 +3914,7 @@ Cell: nv | 0.25s
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
- | N/A 31C P0 107W / 350W | 0MiB / 46068MiB | 100% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.25s
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3940
  </span> |
3941
- Cell: benchmark | 4.12s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3989,29 +3989,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L128_bfloat16
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3991
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3992
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.560ms 101.41% 3.560ms 3.560ms 1
3993
- torch_flash_ma 6.12% 330.406us 49.12% 2.651ms 2.651ms 0.000us 0.00% 3.550ms 3.550ms 1
3994
- aten::scaled_dot_product_attention 0.76% 41.091us 4.12% 222.225us 74.075us 0.000us 0.00% 2.785ms 928.191us 3
3995
- aten::_scaled_dot_product_flash_attention 0.57% 30.902us 3.36% 181.134us 60.378us 0.000us 0.00% 2.785ms 928.191us 3
3996
- aten::_flash_attention_forward 0.74% 39.881us 2.41% 130.323us 43.441us 2.785ms 79.34% 2.785ms 928.191us 3
3997
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.785ms 79.34% 2.785ms 928.191us 3
3998
- aten::contiguous 0.24% 12.809us 37.68% 2.033ms 169.455us 0.000us 0.00% 765.791us 63.816us 12
3999
- aten::clone 0.64% 34.521us 37.44% 2.021ms 168.387us 0.000us 0.00% 765.791us 63.816us 12
4000
- aten::copy_ 1.67% 90.094us 35.26% 1.903ms 158.570us 725.311us 20.66% 765.791us 63.816us 12
4001
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 725.311us 20.66% 725.311us 60.443us 12
4002
- Activity Buffer Request 31.66% 1.709ms 31.66% 1.709ms 1.709ms 40.480us 1.15% 40.480us 40.480us 1
4003
- aten::transpose 1.17% 63.269us 1.58% 85.140us 3.548us 0.000us 0.00% 0.000us 0.000us 24
4004
- aten::as_strided 0.41% 21.871us 0.41% 21.871us 0.911us 0.000us 0.00% 0.000us 0.000us 24
4005
- aten::empty_like 0.47% 25.421us 1.97% 106.322us 7.088us 0.000us 0.00% 0.000us 0.000us 15
4006
- aten::empty 1.76% 94.971us 1.76% 94.971us 3.957us 0.000us 0.00% 0.000us 0.000us 24
4007
- cudaLaunchKernel 2.37% 128.144us 2.37% 128.144us 8.543us 0.000us 0.00% 0.000us 0.000us 15
4008
- aten::empty_strided 0.32% 17.100us 0.32% 17.100us 5.700us 0.000us 0.00% 0.000us 0.000us 3
4009
- cudaDeviceGetAttribute 0.04% 2.290us 0.04% 2.290us 0.382us 0.000us 0.00% 0.000us 0.000us 6
4010
- cudaFuncSetAttribute 0.18% 9.631us 0.18% 9.631us 3.210us 0.000us 0.00% 0.000us 0.000us 3
4011
- cudaDeviceSynchronize 50.88% 2.746ms 50.88% 2.746ms 2.746ms 0.000us 0.00% 0.000us 0.000us 1
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
- Self CPU time total: 5.397ms
4014
- Self CUDA time total: 3.510ms
4015
 
4016
 
4017
 
@@ -4021,29 +4021,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L256_bfloat16
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4023
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4024
- torch_flash_ma 4.59% 254.063us 44.59% 2.468ms 2.468ms 0.000us 0.00% 3.765ms 3.765ms 1
4025
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.721ms 100.30% 3.721ms 3.721ms 1
4026
- aten::scaled_dot_product_attention 0.43% 23.691us 3.30% 182.385us 60.795us 0.000us 0.00% 2.950ms 983.280us 3
4027
- aten::_scaled_dot_product_flash_attention 0.32% 17.969us 2.87% 158.694us 52.898us 0.000us 0.00% 2.950ms 983.280us 3
4028
- aten::_flash_attention_forward 0.74% 40.930us 2.17% 120.223us 40.074us 2.950ms 79.52% 2.950ms 983.280us 3
4029
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.950ms 79.52% 2.950ms 983.280us 3
4030
- aten::contiguous 0.16% 8.922us 35.94% 1.989ms 165.775us 0.000us 0.00% 815.354us 67.946us 12
4031
- aten::clone 0.46% 25.650us 35.78% 1.980ms 165.031us 0.000us 0.00% 815.354us 67.946us 12
4032
- aten::copy_ 1.41% 78.081us 34.18% 1.891ms 157.619us 759.770us 20.48% 815.354us 67.946us 12
4033
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 759.770us 20.48% 759.770us 63.314us 12
4034
- Activity Buffer Request 31.33% 1.734ms 31.33% 1.734ms 1.734ms 55.584us 1.50% 55.584us 55.584us 1
4035
- aten::transpose 0.84% 46.272us 1.13% 62.592us 2.608us 0.000us 0.00% 0.000us 0.000us 24
4036
- aten::as_strided 0.29% 16.320us 0.29% 16.320us 0.680us 0.000us 0.00% 0.000us 0.000us 24
4037
- aten::empty_like 0.39% 21.392us 1.49% 82.711us 5.514us 0.000us 0.00% 0.000us 0.000us 15
4038
- aten::empty 1.42% 78.721us 1.42% 78.721us 3.280us 0.000us 0.00% 0.000us 0.000us 24
4039
- cudaLaunchKernel 1.84% 101.714us 1.84% 101.714us 6.781us 0.000us 0.00% 0.000us 0.000us 15
4040
- aten::empty_strided 0.25% 13.930us 0.25% 13.930us 4.643us 0.000us 0.00% 0.000us 0.000us 3
4041
- cudaDeviceGetAttribute 0.03% 1.700us 0.03% 1.700us 0.283us 0.000us 0.00% 0.000us 0.000us 6
4042
- cudaFuncSetAttribute 0.08% 4.360us 0.08% 4.360us 1.453us 0.000us 0.00% 0.000us 0.000us 3
4043
- cudaDeviceSynchronize 55.41% 3.067ms 55.41% 3.067ms 3.067ms 0.000us 0.00% 0.000us 0.000us 1
4044
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4045
  Self CPU time total: 5.534ms
4046
- Self CUDA time total: 3.710ms
4047
 
4048
 
4049
 
@@ -4053,29 +4053,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L320_bfloat16
4053
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4055
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4056
- torch_flash_ma 4.62% 254.756us 44.14% 2.433ms 2.433ms 0.000us 0.00% 3.774ms 3.774ms 1
4057
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.727ms 100.29% 3.727ms 3.727ms 1
4058
- aten::scaled_dot_product_attention 0.43% 23.830us 3.33% 183.454us 61.151us 0.000us 0.00% 2.942ms 980.796us 3
4059
- aten::_scaled_dot_product_flash_attention 0.32% 17.891us 2.90% 159.624us 53.208us 0.000us 0.00% 2.942ms 980.796us 3
4060
- aten::_flash_attention_forward 0.73% 40.074us 2.20% 121.152us 40.384us 2.942ms 79.17% 2.942ms 980.796us 3
4061
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.942ms 79.17% 2.942ms 980.796us 3
4062
- aten::contiguous 0.16% 8.718us 35.43% 1.953ms 162.745us 0.000us 0.00% 831.581us 69.298us 12
4063
- aten::clone 0.47% 25.749us 35.27% 1.944ms 162.019us 0.000us 0.00% 831.581us 69.298us 12
4064
- aten::copy_ 1.40% 77.041us 33.64% 1.855ms 154.552us 774.142us 20.83% 831.581us 69.298us 12
4065
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 774.142us 20.83% 774.142us 64.512us 12
4066
- Activity Buffer Request 30.83% 1.700ms 30.83% 1.700ms 1.700ms 57.439us 1.55% 57.439us 57.439us 1
4067
- aten::transpose 0.84% 46.360us 1.13% 62.482us 2.603us 0.000us 0.00% 0.000us 0.000us 24
4068
- aten::as_strided 0.29% 16.122us 0.29% 16.122us 0.672us 0.000us 0.00% 0.000us 0.000us 24
4069
- aten::empty_like 0.36% 19.611us 1.53% 84.374us 5.625us 0.000us 0.00% 0.000us 0.000us 15
4070
- aten::empty 1.44% 79.561us 1.44% 79.561us 3.315us 0.000us 0.00% 0.000us 0.000us 24
4071
- cudaLaunchKernel 1.87% 102.913us 1.87% 102.913us 6.861us 0.000us 0.00% 0.000us 0.000us 15
4072
- aten::empty_strided 0.28% 15.330us 0.28% 15.330us 5.110us 0.000us 0.00% 0.000us 0.000us 3
4073
- cudaDeviceGetAttribute 0.03% 1.680us 0.03% 1.680us 0.280us 0.000us 0.00% 0.000us 0.000us 6
4074
- cudaFuncSetAttribute 0.07% 3.840us 0.07% 3.840us 1.280us 0.000us 0.00% 0.000us 0.000us 3
4075
- cudaDeviceSynchronize 55.86% 3.080ms 55.86% 3.080ms 3.080ms 0.000us 0.00% 0.000us 0.000us 1
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
- Self CPU time total: 5.513ms
4078
- Self CUDA time total: 3.717ms
4079
 
4080
 
4081
 
@@ -4085,29 +4085,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L384_bfloat16
4085
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4086
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
- torch_flash_ma 4.28% 249.055us 45.91% 2.672ms 2.672ms 0.000us 0.00% 3.870ms 3.870ms 1
4089
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.822ms 100.28% 3.822ms 3.822ms 1
4090
- aten::scaled_dot_product_attention 0.44% 25.342us 3.23% 187.955us 62.652us 0.000us 0.00% 3.022ms 1.007ms 3
4091
- aten::_scaled_dot_product_flash_attention 0.30% 17.701us 2.79% 162.613us 54.204us 0.000us 0.00% 3.022ms 1.007ms 3
4092
- aten::_flash_attention_forward 0.71% 41.280us 2.11% 122.541us 40.847us 3.022ms 79.29% 3.022ms 1.007ms 3
4093
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.022ms 79.29% 3.022ms 1.007ms 3
4094
- aten::contiguous 0.16% 9.081us 37.65% 2.191ms 182.597us 0.000us 0.00% 847.483us 70.624us 12
4095
- aten::clone 0.47% 27.546us 37.50% 2.182ms 181.840us 0.000us 0.00% 847.483us 70.624us 12
4096
- aten::copy_ 1.40% 81.736us 35.91% 2.090ms 174.156us 789.211us 20.71% 847.483us 70.624us 12
4097
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 789.211us 20.71% 789.211us 65.768us 12
4098
- Activity Buffer Request 29.46% 1.714ms 29.46% 1.714ms 1.714ms 58.272us 1.53% 58.272us 58.272us 1
4099
- aten::transpose 0.83% 48.521us 1.13% 65.981us 2.749us 0.000us 0.00% 0.000us 0.000us 24
4100
- aten::as_strided 0.30% 17.460us 0.30% 17.460us 0.727us 0.000us 0.00% 0.000us 0.000us 24
4101
- aten::empty_like 0.35% 20.461us 1.45% 84.343us 5.623us 0.000us 0.00% 0.000us 0.000us 15
4102
- aten::empty 1.38% 80.070us 1.38% 80.070us 3.336us 0.000us 0.00% 0.000us 0.000us 24
4103
- cudaLaunchKernel 5.47% 318.217us 5.47% 318.217us 21.214us 0.000us 0.00% 0.000us 0.000us 15
4104
- aten::empty_strided 0.25% 14.521us 0.25% 14.521us 4.840us 0.000us 0.00% 0.000us 0.000us 3
4105
- cudaDeviceGetAttribute 0.03% 1.689us 0.03% 1.689us 0.282us 0.000us 0.00% 0.000us 0.000us 6
4106
- cudaFuncSetAttribute 0.08% 4.671us 0.08% 4.671us 1.557us 0.000us 0.00% 0.000us 0.000us 3
4107
- cudaDeviceSynchronize 54.09% 3.147ms 54.09% 3.147ms 3.147ms 0.000us 0.00% 0.000us 0.000us 1
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
- Self CPU time total: 5.819ms
4110
- Self CUDA time total: 3.811ms
4111
 
4112
 
4113
 
@@ -4117,29 +4117,29 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L448_bfloat16
4117
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4118
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4119
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4120
- torch_flash_ma 4.79% 300.628us 43.01% 2.699ms 2.699ms 0.000us 0.00% 4.340ms 4.340ms 1
4121
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.290ms 100.25% 4.290ms 4.290ms 1
4122
- aten::scaled_dot_product_attention 0.40% 25.381us 2.96% 185.704us 61.901us 0.000us 0.00% 3.474ms 1.158ms 3
4123
- aten::_scaled_dot_product_flash_attention 0.28% 17.780us 2.55% 160.323us 53.441us 0.000us 0.00% 3.474ms 1.158ms 3
4124
- aten::_flash_attention_forward 0.64% 40.370us 1.93% 121.223us 40.408us 3.474ms 81.17% 3.474ms 1.158ms 3
4125
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.474ms 81.17% 3.474ms 1.158ms 3
4126
- aten::contiguous 0.14% 9.022us 34.56% 2.169ms 180.719us 0.000us 0.00% 866.336us 72.195us 12
4127
- aten::clone 0.44% 27.858us 34.41% 2.160ms 179.967us 0.000us 0.00% 866.336us 72.195us 12
4128
- aten::copy_ 1.24% 77.719us 32.91% 2.066ms 172.130us 806.048us 18.83% 866.336us 72.195us 12
4129
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 806.048us 18.83% 806.048us 67.171us 12
4130
- Activity Buffer Request 27.70% 1.738ms 27.70% 1.738ms 1.738ms 60.288us 1.41% 60.288us 60.288us 1
4131
- aten::transpose 0.77% 48.240us 1.05% 65.650us 2.735us 0.000us 0.00% 0.000us 0.000us 24
4132
- aten::as_strided 0.28% 17.410us 0.28% 17.410us 0.725us 0.000us 0.00% 0.000us 0.000us 24
4133
- aten::empty_like 0.34% 21.363us 1.38% 86.453us 5.764us 0.000us 0.00% 0.000us 0.000us 15
4134
- aten::empty 1.28% 80.561us 1.28% 80.561us 3.357us 0.000us 0.00% 0.000us 0.000us 24
4135
- cudaLaunchKernel 4.36% 273.888us 4.36% 273.888us 18.259us 0.000us 0.00% 0.000us 0.000us 15
4136
- aten::empty_strided 0.24% 14.900us 0.24% 14.900us 4.967us 0.000us 0.00% 0.000us 0.000us 3
4137
- cudaDeviceGetAttribute 0.03% 1.700us 0.03% 1.700us 0.283us 0.000us 0.00% 0.000us 0.000us 6
4138
- cudaFuncSetAttribute 0.07% 4.100us 0.07% 4.100us 1.367us 0.000us 0.00% 0.000us 0.000us 3
4139
- cudaDeviceSynchronize 56.99% 3.576ms 56.99% 3.576ms 3.576ms 0.000us 0.00% 0.000us 0.000us 1
4140
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4141
- Self CPU time total: 6.275ms
4142
- Self CUDA time total: 4.280ms
4143
 
4144
 
4145
 
@@ -4149,38 +4149,38 @@ PROFILE TRACE: torch_flash_ma | cuda_attn_L512_bfloat16
4149
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4150
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4151
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4152
- torch_flash_ma 4.01% 253.526us 41.16% 2.602ms 2.602ms 0.000us 0.00% 4.429ms 4.429ms 1
4153
- torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.378ms 100.24% 4.378ms 4.378ms 1
4154
- aten::scaled_dot_product_attention 0.38% 23.889us 2.89% 182.483us 60.828us 0.000us 0.00% 3.556ms 1.185ms 3
4155
- aten::_scaled_dot_product_flash_attention 0.27% 17.360us 2.51% 158.594us 52.865us 0.000us 0.00% 3.556ms 1.185ms 3
4156
- aten::_flash_attention_forward 0.66% 42.013us 1.90% 120.422us 40.141us 3.556ms 81.42% 3.556ms 1.185ms 3
4157
- void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.556ms 81.42% 3.556ms 1.185ms 3
4158
- aten::contiguous 0.14% 8.630us 33.58% 2.122ms 176.875us 0.000us 0.00% 872.667us 72.722us 12
4159
- aten::clone 0.41% 26.047us 33.44% 2.114ms 176.156us 0.000us 0.00% 872.667us 72.722us 12
4160
- aten::copy_ 1.25% 79.082us 32.00% 2.023ms 168.597us 811.483us 18.58% 872.667us 72.722us 12
4161
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 811.483us 18.58% 811.483us 67.624us 12
4162
- Activity Buffer Request 26.87% 1.699ms 26.87% 1.699ms 1.699ms 61.184us 1.40% 61.184us 61.184us 1
4163
- aten::transpose 0.75% 47.653us 1.02% 64.533us 2.689us 0.000us 0.00% 0.000us 0.000us 24
4164
- aten::as_strided 0.27% 16.880us 0.27% 16.880us 0.703us 0.000us 0.00% 0.000us 0.000us 24
4165
- aten::empty_like 0.33% 20.879us 1.34% 84.642us 5.643us 0.000us 0.00% 0.000us 0.000us 15
4166
- aten::empty 1.25% 79.031us 1.25% 79.031us 3.293us 0.000us 0.00% 0.000us 0.000us 24
4167
- cudaLaunchKernel 4.24% 268.168us 4.24% 268.168us 17.878us 0.000us 0.00% 0.000us 0.000us 15
4168
- aten::empty_strided 0.23% 14.592us 0.23% 14.592us 4.864us 0.000us 0.00% 0.000us 0.000us 3
4169
- cudaDeviceGetAttribute 0.03% 1.679us 0.03% 1.679us 0.280us 0.000us 0.00% 0.000us 0.000us 6
4170
- cudaFuncSetAttribute 0.06% 3.920us 0.06% 3.920us 1.307us 0.000us 0.00% 0.000us 0.000us 3
4171
- cudaDeviceSynchronize 58.84% 3.719ms 58.84% 3.719ms 3.719ms 0.000us 0.00% 0.000us 0.000us 1
4172
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4173
- Self CPU time total: 6.322ms
4174
- Self CUDA time total: 4.367ms
4175
 
4176
 
4177
  impl wl p50(ms) ok
4178
  torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
4179
- torch_flash_ma cuda_attn_L256_bfloat16 1.25 True
4180
- torch_flash_ma cuda_attn_L320_bfloat16 1.28 True
4181
- torch_flash_ma cuda_attn_L384_bfloat16 1.31 True
4182
- torch_flash_ma cuda_attn_L448_bfloat16 1.45 True
4183
- torch_flash_ma cuda_attn_L512_bfloat16 1.49 True
4184
  </pre></div>
4185
  <div class="cell-artifacts">
4186
  <h4>Artifacts:</h4>
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.28s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 23:02:01 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
 
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
+ | N/A 42C P0 86W / 350W | 0MiB / 46068MiB | 20% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
 
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3940
  </span> |
3941
+ Cell: benchmark | 4.27s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3989
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3990
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3991
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3992
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.630ms 102.63% 3.630ms 3.630ms 1
3993
+ torch_flash_ma 6.38% 352.556us 49.11% 2.714ms 2.714ms 0.000us 0.00% 3.576ms 3.576ms 1
3994
+ aten::scaled_dot_product_attention 0.73% 40.491us 3.98% 220.075us 73.358us 0.000us 0.00% 2.821ms 940.462us 3
3995
+ aten::_scaled_dot_product_flash_attention 0.47% 25.779us 3.25% 179.584us 59.861us 0.000us 0.00% 2.821ms 940.462us 3
3996
+ aten::_flash_attention_forward 0.70% 38.829us 2.35% 129.692us 43.231us 2.821ms 79.77% 2.821ms 940.462us 3
3997
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.821ms 79.77% 2.821ms 940.462us 3
3998
+ aten::contiguous 0.22% 12.191us 37.53% 2.074ms 172.866us 0.000us 0.00% 755.108us 62.926us 12
3999
+ aten::clone 0.60% 33.381us 37.31% 2.062ms 171.850us 0.000us 0.00% 755.108us 62.926us 12
4000
+ aten::copy_ 1.61% 89.181us 35.26% 1.949ms 162.385us 715.299us 20.23% 755.108us 62.926us 12
4001
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 715.299us 20.23% 715.299us 59.608us 12
4002
+ Activity Buffer Request 31.81% 1.758ms 31.81% 1.758ms 1.758ms 39.809us 1.13% 39.809us 39.809us 1
4003
+ aten::transpose 1.21% 66.774us 1.65% 91.006us 3.792us 0.000us 0.00% 0.000us 0.000us 24
4004
+ aten::as_strided 0.44% 24.232us 0.44% 24.232us 1.010us 0.000us 0.00% 0.000us 0.000us 24
4005
+ aten::empty_like 0.44% 24.459us 1.87% 103.512us 6.901us 0.000us 0.00% 0.000us 0.000us 15
4006
+ aten::empty 1.67% 92.213us 1.67% 92.213us 3.842us 0.000us 0.00% 0.000us 0.000us 24
4007
+ cudaLaunchKernel 2.28% 126.282us 2.28% 126.282us 8.419us 0.000us 0.00% 0.000us 0.000us 15
4008
+ aten::empty_strided 0.31% 16.960us 0.31% 16.960us 5.653us 0.000us 0.00% 0.000us 0.000us 3
4009
+ cudaDeviceGetAttribute 0.04% 2.141us 0.04% 2.141us 0.357us 0.000us 0.00% 0.000us 0.000us 6
4010
+ cudaFuncSetAttribute 0.19% 10.441us 0.19% 10.441us 3.480us 0.000us 0.00% 0.000us 0.000us 3
4011
+ cudaDeviceSynchronize 50.89% 2.813ms 50.89% 2.813ms 2.813ms 0.000us 0.00% 0.000us 0.000us 1
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
+ Self CPU time total: 5.527ms
4014
+ Self CUDA time total: 3.537ms
4015
 
4016
 
4017
 
 
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4023
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4024
+ torch_flash_ma 4.56% 252.356us 44.41% 2.457ms 2.457ms 0.000us 0.00% 3.793ms 3.793ms 1
4025
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.748ms 100.29% 3.748ms 3.748ms 1
4026
+ aten::scaled_dot_product_attention 0.44% 24.090us 3.37% 186.293us 62.098us 0.000us 0.00% 2.975ms 991.820us 3
4027
+ aten::_scaled_dot_product_flash_attention 0.34% 18.721us 2.93% 162.203us 54.068us 0.000us 0.00% 2.975ms 991.820us 3
4028
+ aten::_flash_attention_forward 0.77% 42.568us 2.18% 120.522us 40.174us 2.975ms 79.63% 2.975ms 991.820us 3
4029
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 2.975ms 79.63% 2.975ms 991.820us 3
4030
+ aten::contiguous 0.18% 9.899us 35.65% 1.973ms 164.423us 0.000us 0.00% 817.633us 68.136us 12
4031
+ aten::clone 0.53% 29.604us 35.48% 1.963ms 163.598us 0.000us 0.00% 817.633us 68.136us 12
4032
+ aten::copy_ 1.46% 80.732us 33.77% 1.869ms 155.723us 761.377us 20.37% 817.633us 68.136us 12
4033
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 761.377us 20.37% 761.377us 63.448us 12
4034
+ Activity Buffer Request 30.82% 1.705ms 30.82% 1.705ms 1.705ms 56.256us 1.51% 56.256us 56.256us 1
4035
+ aten::transpose 0.91% 50.232us 1.24% 68.680us 2.862us 0.000us 0.00% 0.000us 0.000us 24
4036
+ aten::as_strided 0.33% 18.448us 0.33% 18.448us 0.769us 0.000us 0.00% 0.000us 0.000us 24
4037
+ aten::empty_like 0.42% 23.239us 1.52% 84.240us 5.616us 0.000us 0.00% 0.000us 0.000us 15
4038
+ aten::empty 1.37% 76.011us 1.37% 76.011us 3.167us 0.000us 0.00% 0.000us 0.000us 24
4039
+ cudaLaunchKernel 1.93% 106.693us 1.93% 106.693us 7.113us 0.000us 0.00% 0.000us 0.000us 15
4040
+ aten::empty_strided 0.25% 13.951us 0.25% 13.951us 4.650us 0.000us 0.00% 0.000us 0.000us 3
4041
+ cudaDeviceGetAttribute 0.03% 1.720us 0.03% 1.720us 0.287us 0.000us 0.00% 0.000us 0.000us 6
4042
+ cudaFuncSetAttribute 0.07% 3.701us 0.07% 3.701us 1.234us 0.000us 0.00% 0.000us 0.000us 3
4043
+ cudaDeviceSynchronize 55.59% 3.076ms 55.59% 3.076ms 3.076ms 0.000us 0.00% 0.000us 0.000us 1
4044
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4045
  Self CPU time total: 5.534ms
4046
+ Self CUDA time total: 3.737ms
4047
 
4048
 
4049
 
 
4053
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4055
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4056
+ torch_flash_ma 4.60% 257.767us 43.91% 2.459ms 2.459ms 0.000us 0.00% 3.868ms 3.868ms 1
4057
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.820ms 100.28% 3.820ms 3.820ms 1
4058
+ aten::scaled_dot_product_attention 0.42% 23.451us 3.31% 185.194us 61.731us 0.000us 0.00% 3.025ms 1.008ms 3
4059
+ aten::_scaled_dot_product_flash_attention 0.35% 19.728us 2.89% 161.743us 53.914us 0.000us 0.00% 3.025ms 1.008ms 3
4060
+ aten::_flash_attention_forward 0.72% 40.171us 2.13% 119.133us 39.711us 3.025ms 79.42% 3.025ms 1.008ms 3
4061
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.025ms 79.42% 3.025ms 1.008ms 3
4062
+ aten::contiguous 0.17% 9.680us 35.16% 1.969ms 164.068us 0.000us 0.00% 843.394us 70.283us 12
4063
+ aten::clone 0.57% 32.118us 34.99% 1.959ms 163.261us 0.000us 0.00% 843.394us 70.283us 12
4064
+ aten::copy_ 1.44% 80.682us 33.24% 1.861ms 155.084us 783.938us 20.58% 843.394us 70.283us 12
4065
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 783.938us 20.58% 783.938us 65.328us 12
4066
+ Activity Buffer Request 30.29% 1.696ms 30.29% 1.696ms 1.696ms 59.456us 1.56% 59.456us 59.456us 1
4067
+ aten::transpose 0.92% 51.272us 1.25% 69.843us 2.910us 0.000us 0.00% 0.000us 0.000us 24
4068
+ aten::as_strided 0.33% 18.571us 0.33% 18.571us 0.774us 0.000us 0.00% 0.000us 0.000us 24
4069
+ aten::empty_like 0.37% 20.823us 1.56% 87.172us 5.811us 0.000us 0.00% 0.000us 0.000us 15
4070
+ aten::empty 1.42% 79.691us 1.42% 79.691us 3.320us 0.000us 0.00% 0.000us 0.000us 24
4071
+ cudaLaunchKernel 1.92% 107.532us 1.92% 107.532us 7.169us 0.000us 0.00% 0.000us 0.000us 15
4072
+ aten::empty_strided 0.28% 15.890us 0.28% 15.890us 5.297us 0.000us 0.00% 0.000us 0.000us 3
4073
+ cudaDeviceGetAttribute 0.03% 1.700us 0.03% 1.700us 0.283us 0.000us 0.00% 0.000us 0.000us 6
4074
+ cudaFuncSetAttribute 0.07% 3.820us 0.07% 3.820us 1.273us 0.000us 0.00% 0.000us 0.000us 3
4075
+ cudaDeviceSynchronize 56.09% 3.140ms 56.09% 3.140ms 3.140ms 0.000us 0.00% 0.000us 0.000us 1
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
+ Self CPU time total: 5.599ms
4078
+ Self CUDA time total: 3.809ms
4079
 
4080
 
4081
 
 
4085
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4086
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
+ torch_flash_ma 4.31% 257.497us 46.49% 2.779ms 2.779ms 0.000us 0.00% 3.937ms 3.937ms 1
4089
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 3.890ms 100.28% 3.890ms 3.890ms 1
4090
+ aten::scaled_dot_product_attention 0.42% 25.301us 3.15% 188.584us 62.861us 0.000us 0.00% 3.098ms 1.033ms 3
4091
+ aten::_scaled_dot_product_flash_attention 0.34% 20.249us 2.73% 163.283us 54.428us 0.000us 0.00% 3.098ms 1.033ms 3
4092
+ aten::_flash_attention_forward 0.67% 40.000us 1.99% 118.763us 39.588us 3.098ms 79.85% 3.098ms 1.033ms 3
4093
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.098ms 79.85% 3.098ms 1.033ms 3
4094
+ aten::contiguous 0.17% 10.243us 38.20% 2.284ms 190.292us 0.000us 0.00% 838.882us 69.907us 12
4095
+ aten::clone 0.53% 31.478us 38.03% 2.273ms 189.439us 0.000us 0.00% 838.882us 69.907us 12
4096
+ aten::copy_ 1.35% 80.860us 36.38% 2.175ms 181.246us 781.730us 20.15% 838.882us 69.907us 12
4097
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 781.730us 20.15% 781.730us 65.144us 12
4098
+ Activity Buffer Request 30.50% 1.823ms 30.50% 1.823ms 1.823ms 57.152us 1.47% 57.152us 57.152us 1
4099
+ aten::transpose 0.90% 53.920us 1.24% 74.061us 3.086us 0.000us 0.00% 0.000us 0.000us 24
4100
+ aten::as_strided 0.34% 20.141us 0.34% 20.141us 0.839us 0.000us 0.00% 0.000us 0.000us 24
4101
+ aten::empty_like 0.36% 21.362us 1.47% 87.614us 5.841us 0.000us 0.00% 0.000us 0.000us 15
4102
+ aten::empty 1.34% 79.813us 1.34% 79.813us 3.326us 0.000us 0.00% 0.000us 0.000us 24
4103
+ cudaLaunchKernel 4.91% 293.806us 4.91% 293.806us 19.587us 0.000us 0.00% 0.000us 0.000us 15
4104
+ aten::empty_strided 0.26% 15.670us 0.26% 15.670us 5.223us 0.000us 0.00% 0.000us 0.000us 3
4105
+ cudaDeviceGetAttribute 0.03% 1.659us 0.03% 1.659us 0.276us 0.000us 0.00% 0.000us 0.000us 6
4106
+ cudaFuncSetAttribute 0.07% 3.921us 0.07% 3.921us 1.307us 0.000us 0.00% 0.000us 0.000us 3
4107
+ cudaDeviceSynchronize 53.51% 3.199ms 53.51% 3.199ms 3.199ms 0.000us 0.00% 0.000us 0.000us 1
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
+ Self CPU time total: 5.978ms
4110
+ Self CUDA time total: 3.880ms
4111
 
4112
 
4113
 
 
4117
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4118
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4119
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4120
+ torch_flash_ma 4.81% 305.765us 42.59% 2.710ms 2.710ms 0.000us 0.00% 4.451ms 4.451ms 1
4121
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.400ms 100.25% 4.400ms 4.400ms 1
4122
+ aten::scaled_dot_product_attention 0.38% 24.020us 2.97% 188.924us 62.975us 0.000us 0.00% 3.579ms 1.193ms 3
4123
+ aten::_scaled_dot_product_flash_attention 0.31% 19.571us 2.59% 164.904us 54.968us 0.000us 0.00% 3.579ms 1.193ms 3
4124
+ aten::_flash_attention_forward 0.68% 43.108us 1.92% 122.012us 40.671us 3.579ms 81.54% 3.579ms 1.193ms 3
4125
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.579ms 81.54% 3.579ms 1.193ms 3
4126
+ aten::contiguous 0.15% 9.589us 34.07% 2.168ms 180.670us 0.000us 0.00% 871.616us 72.635us 12
4127
+ aten::clone 0.54% 34.360us 33.92% 2.158ms 179.871us 0.000us 0.00% 871.616us 72.635us 12
4128
+ aten::copy_ 1.33% 84.914us 32.32% 2.057ms 171.390us 810.495us 18.46% 871.616us 72.635us 12
4129
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 810.495us 18.46% 810.495us 67.541us 12
4130
+ Activity Buffer Request 26.57% 1.691ms 26.57% 1.691ms 1.691ms 61.121us 1.39% 61.121us 61.121us 1
4131
+ aten::transpose 0.82% 51.874us 1.12% 70.963us 2.957us 0.000us 0.00% 0.000us 0.000us 24
4132
+ aten::as_strided 0.30% 19.089us 0.30% 19.089us 0.795us 0.000us 0.00% 0.000us 0.000us 24
4133
+ aten::empty_like 0.34% 21.431us 1.39% 88.502us 5.900us 0.000us 0.00% 0.000us 0.000us 15
4134
+ aten::empty 1.27% 80.674us 1.27% 80.674us 3.361us 0.000us 0.00% 0.000us 0.000us 24
4135
+ cudaLaunchKernel 4.78% 304.046us 4.78% 304.046us 20.270us 0.000us 0.00% 0.000us 0.000us 15
4136
+ aten::empty_strided 0.25% 15.780us 0.25% 15.780us 5.260us 0.000us 0.00% 0.000us 0.000us 3
4137
+ cudaDeviceGetAttribute 0.02% 1.550us 0.02% 1.550us 0.258us 0.000us 0.00% 0.000us 0.000us 6
4138
+ cudaFuncSetAttribute 0.06% 3.750us 0.06% 3.750us 1.250us 0.000us 0.00% 0.000us 0.000us 3
4139
+ cudaDeviceSynchronize 57.41% 3.653ms 57.41% 3.653ms 3.653ms 0.000us 0.00% 0.000us 0.000us 1
4140
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4141
+ Self CPU time total: 6.363ms
4142
+ Self CUDA time total: 4.389ms
4143
 
4144
 
4145
 
 
4149
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4150
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4151
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4152
+ torch_flash_ma 3.57% 230.352us 40.90% 2.641ms 2.641ms 0.000us 0.00% 4.540ms 4.540ms 1
4153
+ torch_flash_ma 0.00% 0.000us 0.00% 0.000us 0.000us 4.489ms 100.24% 4.489ms 4.489ms 1
4154
+ aten::scaled_dot_product_attention 0.38% 24.551us 2.77% 178.785us 59.595us 0.000us 0.00% 3.667ms 1.222ms 3
4155
+ aten::_scaled_dot_product_flash_attention 0.30% 19.129us 2.39% 154.234us 51.411us 0.000us 0.00% 3.667ms 1.222ms 3
4156
+ aten::_flash_attention_forward 0.55% 35.197us 1.71% 110.631us 36.877us 3.667ms 81.88% 3.667ms 1.222ms 3
4157
+ void pytorch_flash::flash_fwd_kernel&lt;Flash_fwd_kerne... 0.00% 0.000us 0.00% 0.000us 0.000us 3.667ms 81.88% 3.667ms 1.222ms 3
4158
+ aten::contiguous 0.16% 10.271us 33.78% 2.181ms 181.772us 0.000us 0.00% 873.057us 72.755us 12
4159
+ aten::clone 0.44% 28.652us 33.62% 2.171ms 180.916us 0.000us 0.00% 873.057us 72.755us 12
4160
+ aten::copy_ 1.30% 83.713us 32.19% 2.078ms 173.208us 811.457us 18.12% 873.057us 72.755us 12
4161
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 811.457us 18.12% 811.457us 67.621us 12
4162
+ Activity Buffer Request 26.83% 1.733ms 26.83% 1.733ms 1.733ms 61.600us 1.38% 61.600us 61.600us 1
4163
+ aten::transpose 0.85% 55.123us 1.17% 75.416us 3.142us 0.000us 0.00% 0.000us 0.000us 24
4164
+ aten::as_strided 0.31% 20.293us 0.31% 20.293us 0.846us 0.000us 0.00% 0.000us 0.000us 24
4165
+ aten::empty_like 0.30% 19.350us 1.29% 83.431us 5.562us 0.000us 0.00% 0.000us 0.000us 15
4166
+ aten::empty 1.21% 78.153us 1.21% 78.153us 3.256us 0.000us 0.00% 0.000us 0.000us 24
4167
+ cudaLaunchKernel 4.40% 284.286us 4.40% 284.286us 18.952us 0.000us 0.00% 0.000us 0.000us 15
4168
+ aten::empty_strided 0.23% 14.650us 0.23% 14.650us 4.883us 0.000us 0.00% 0.000us 0.000us 3
4169
+ cudaDeviceGetAttribute 0.03% 1.640us 0.03% 1.640us 0.273us 0.000us 0.00% 0.000us 0.000us 6
4170
+ cudaFuncSetAttribute 0.05% 3.450us 0.05% 3.450us 1.150us 0.000us 0.00% 0.000us 0.000us 3
4171
+ cudaDeviceSynchronize 59.10% 3.816ms 59.10% 3.816ms 3.816ms 0.000us 0.00% 0.000us 0.000us 1
4172
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4173
+ Self CPU time total: 6.458ms
4174
+ Self CUDA time total: 4.478ms
4175
 
4176
 
4177
  impl wl p50(ms) ok
4178
  torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
4179
+ torch_flash_ma cuda_attn_L256_bfloat16 1.28 True
4180
+ torch_flash_ma cuda_attn_L320_bfloat16 1.30 True
4181
+ torch_flash_ma cuda_attn_L384_bfloat16 1.33 True
4182
+ torch_flash_ma cuda_attn_L448_bfloat16 1.49 True
4183
+ torch_flash_ma cuda_attn_L512_bfloat16 1.52 True
4184
  </pre></div>
4185
  <div class="cell-artifacts">
4186
  <h4>Artifacts:</h4>
flash_attn/impls/hf_kernels_flash_attn.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: benchmark | 5.91s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3943,21 +3943,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L128_bfloat16
3943
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3944
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3945
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3946
- hf_kernels_flash_attn 3.35% 155.232us 44.97% 2.082ms 2.082ms 0.000us 0.00% 3.704ms 3.704ms 1
3947
- _flash_attn_9e27194::fwd 1.43% 66.152us 41.62% 1.927ms 642.264us 2.766ms 100.00% 3.704ms 1.235ms 3
3948
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.768ms 100.06% 2.768ms 2.768ms 1
3949
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.766ms 100.00% 2.766ms 922.153us 3
3950
- Activity Buffer Request 37.12% 1.719ms 37.12% 1.719ms 1.719ms 937.630us 33.89% 937.630us 937.630us 1
3951
- cudaDeviceGetAttribute 0.12% 5.360us 0.12% 5.360us 0.357us 0.000us 0.00% 0.000us 0.000us 15
3952
- aten::empty_like 0.39% 18.222us 1.18% 54.592us 18.197us 0.000us 0.00% 0.000us 0.000us 3
3953
- aten::empty_strided 0.79% 36.370us 0.79% 36.370us 12.123us 0.000us 0.00% 0.000us 0.000us 3
3954
- aten::empty 0.56% 25.741us 0.56% 25.741us 2.860us 0.000us 0.00% 0.000us 0.000us 9
3955
- cudaFuncSetAttribute 0.30% 13.770us 0.30% 13.770us 4.590us 0.000us 0.00% 0.000us 0.000us 3
3956
- cudaLaunchKernel 0.92% 42.401us 0.92% 42.401us 14.134us 0.000us 0.00% 0.000us 0.000us 3
3957
- cudaDeviceSynchronize 55.03% 2.548ms 55.03% 2.548ms 2.548ms 0.000us 0.00% 0.000us 0.000us 1
3958
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3959
- Self CPU time total: 4.630ms
3960
- Self CUDA time total: 2.766ms
3961
 
3962
 
3963
 
@@ -3967,21 +3967,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L256_bfloat16
3967
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3968
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3969
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3970
- hf_kernels_flash_attn 1.95% 91.533us 41.78% 1.962ms 1.962ms 0.000us 0.00% 3.856ms 3.856ms 1
3971
- _flash_attn_9e27194::fwd 1.04% 49.050us 39.83% 1.870ms 623.350us 2.882ms 100.00% 3.856ms 1.285ms 3
3972
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.884ms 100.05% 2.884ms 2.884ms 1
3973
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.882ms 100.00% 2.882ms 960.764us 3
3974
- Activity Buffer Request 36.88% 1.732ms 36.88% 1.732ms 1.732ms 973.756us 33.78% 973.756us 973.756us 1
3975
- cudaDeviceGetAttribute 0.09% 4.030us 0.09% 4.030us 0.269us 0.000us 0.00% 0.000us 0.000us 15
3976
- aten::empty_like 0.18% 8.460us 0.61% 28.490us 9.497us 0.000us 0.00% 0.000us 0.000us 3
3977
- aten::empty_strided 0.43% 20.030us 0.43% 20.030us 6.677us 0.000us 0.00% 0.000us 0.000us 3
3978
- aten::empty 0.55% 25.961us 0.55% 25.961us 2.885us 0.000us 0.00% 0.000us 0.000us 9
3979
- cudaFuncSetAttribute 0.08% 3.700us 0.08% 3.700us 1.233us 0.000us 0.00% 0.000us 0.000us 3
3980
- cudaLaunchKernel 0.58% 27.091us 0.58% 27.091us 9.030us 0.000us 0.00% 0.000us 0.000us 3
3981
- cudaDeviceSynchronize 58.22% 2.734ms 58.22% 2.734ms 2.734ms 0.000us 0.00% 0.000us 0.000us 1
3982
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3983
- Self CPU time total: 4.695ms
3984
- Self CUDA time total: 2.882ms
3985
 
3986
 
3987
 
@@ -3991,21 +3991,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L320_bfloat16
3991
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3992
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3993
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3994
- hf_kernels_flash_attn 2.18% 107.861us 40.50% 2.008ms 2.008ms 0.000us 0.00% 4.125ms 4.125ms 1
3995
- _flash_attn_9e27194::fwd 0.99% 48.872us 38.32% 1.900ms 633.314us 3.094ms 100.00% 4.125ms 1.375ms 3
3996
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.095ms 100.05% 3.095ms 3.095ms 1
3997
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.094ms 100.00% 3.094ms 1.031ms 3
3998
- Activity Buffer Request 35.67% 1.768ms 35.67% 1.768ms 1.768ms 1.032ms 33.35% 1.032ms 1.032ms 1
3999
- cudaDeviceGetAttribute 0.09% 4.480us 0.09% 4.480us 0.299us 0.000us 0.00% 0.000us 0.000us 15
4000
- aten::empty_like 0.13% 6.580us 0.45% 22.520us 7.507us 0.000us 0.00% 0.000us 0.000us 3
4001
- aten::empty_strided 0.32% 15.940us 0.32% 15.940us 5.313us 0.000us 0.00% 0.000us 0.000us 3
4002
- aten::empty 0.47% 23.250us 0.47% 23.250us 2.583us 0.000us 0.00% 0.000us 0.000us 9
4003
- cudaFuncSetAttribute 0.08% 3.791us 0.08% 3.791us 1.264us 0.000us 0.00% 0.000us 0.000us 3
4004
- cudaLaunchKernel 0.58% 28.541us 0.58% 28.541us 9.514us 0.000us 0.00% 0.000us 0.000us 3
4005
- cudaDeviceSynchronize 59.50% 2.950ms 59.50% 2.950ms 2.950ms 0.000us 0.00% 0.000us 0.000us 1
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
- Self CPU time total: 4.958ms
4008
- Self CUDA time total: 3.094ms
4009
 
4010
 
4011
 
@@ -4015,21 +4015,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L384_bfloat16
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
- hf_kernels_flash_attn 2.18% 109.362us 41.99% 2.109ms 2.109ms 0.000us 0.00% 4.102ms 4.102ms 1
4019
- _flash_attn_9e27194::fwd 1.01% 50.650us 39.81% 1.999ms 666.498us 3.061ms 100.00% 4.102ms 1.367ms 3
4020
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.062ms 100.05% 3.062ms 3.062ms 1
4021
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.061ms 100.00% 3.061ms 1.020ms 3
4022
- Activity Buffer Request 33.41% 1.678ms 33.41% 1.678ms 1.678ms 1.041ms 34.02% 1.041ms 1.041ms 1
4023
- cudaDeviceGetAttribute 0.08% 4.070us 0.08% 4.070us 0.271us 0.000us 0.00% 0.000us 0.000us 15
4024
- aten::empty_like 0.14% 6.851us 0.49% 24.381us 8.127us 0.000us 0.00% 0.000us 0.000us 3
4025
- aten::empty_strided 0.35% 17.530us 0.35% 17.530us 5.843us 0.000us 0.00% 0.000us 0.000us 3
4026
- aten::empty 0.44% 22.140us 0.44% 22.140us 2.460us 0.000us 0.00% 0.000us 0.000us 9
4027
- cudaFuncSetAttribute 0.08% 3.810us 0.08% 3.810us 1.270us 0.000us 0.00% 0.000us 0.000us 3
4028
- cudaLaunchKernel 4.31% 216.396us 4.31% 216.396us 72.132us 0.000us 0.00% 0.000us 0.000us 3
4029
- cudaDeviceSynchronize 58.01% 2.914ms 58.01% 2.914ms 2.914ms 0.000us 0.00% 0.000us 0.000us 1
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
- Self CPU time total: 5.023ms
4032
- Self CUDA time total: 3.061ms
4033
 
4034
 
4035
 
@@ -4039,21 +4039,21 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L448_bfloat16
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4041
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4042
- hf_kernels_flash_attn 1.91% 108.693us 38.60% 2.193ms 2.193ms 0.000us 0.00% 4.850ms 4.850ms 1
4043
- _flash_attn_9e27194::fwd 0.87% 49.481us 36.69% 2.084ms 694.644us 3.635ms 100.00% 4.850ms 1.617ms 3
4044
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.637ms 100.05% 3.637ms 3.637ms 1
4045
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.635ms 100.00% 3.635ms 1.212ms 3
4046
- Activity Buffer Request 31.43% 1.785ms 31.43% 1.785ms 1.785ms 1.215ms 33.41% 1.215ms 1.215ms 1
4047
- cudaDeviceGetAttribute 0.07% 3.761us 0.07% 3.761us 0.251us 0.000us 0.00% 0.000us 0.000us 15
4048
- aten::empty_like 0.12% 6.970us 0.43% 24.340us 8.113us 0.000us 0.00% 0.000us 0.000us 3
4049
- aten::empty_strided 0.31% 17.370us 0.31% 17.370us 5.790us 0.000us 0.00% 0.000us 0.000us 3
4050
- aten::empty 0.43% 24.270us 0.43% 24.270us 2.697us 0.000us 0.00% 0.000us 0.000us 9
4051
- cudaFuncSetAttribute 0.07% 3.730us 0.07% 3.730us 1.243us 0.000us 0.00% 0.000us 0.000us 3
4052
- cudaLaunchKernel 3.40% 193.224us 3.40% 193.224us 64.408us 0.000us 0.00% 0.000us 0.000us 3
4053
- cudaDeviceSynchronize 61.40% 3.487ms 61.40% 3.487ms 3.487ms 0.000us 0.00% 0.000us 0.000us 1
4054
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4055
- Self CPU time total: 5.680ms
4056
- Self CUDA time total: 3.635ms
4057
 
4058
 
4059
 
@@ -4063,36 +4063,36 @@ PROFILE TRACE: hf_kernels_flash_attn | cuda_attn_L512_bfloat16
4063
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4064
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
- hf_kernels_flash_attn 1.90% 106.201us 36.85% 2.064ms 2.064ms 0.000us 0.00% 4.915ms 4.915ms 1
4067
- _flash_attn_9e27194::fwd 0.89% 50.062us 34.96% 1.958ms 652.751us 3.682ms 100.00% 4.915ms 1.638ms 3
4068
- hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.684ms 100.05% 3.684ms 3.684ms 1
4069
- void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.682ms 100.00% 3.682ms 1.227ms 3
4070
- Activity Buffer Request 29.73% 1.666ms 29.73% 1.666ms 1.666ms 1.233ms 33.48% 1.233ms 1.233ms 1
4071
- cudaDeviceGetAttribute 0.07% 4.189us 0.07% 4.189us 0.279us 0.000us 0.00% 0.000us 0.000us 15
4072
- aten::empty_like 0.12% 6.851us 0.45% 25.301us 8.434us 0.000us 0.00% 0.000us 0.000us 3
4073
- aten::empty_strided 0.33% 18.450us 0.33% 18.450us 6.150us 0.000us 0.00% 0.000us 0.000us 3
4074
- aten::empty 0.40% 22.632us 0.40% 22.632us 2.515us 0.000us 0.00% 0.000us 0.000us 9
4075
- cudaFuncSetAttribute 0.07% 3.850us 0.07% 3.850us 1.283us 0.000us 0.00% 0.000us 0.000us 3
4076
- cudaLaunchKernel 3.33% 186.623us 3.33% 186.623us 62.208us 0.000us 0.00% 0.000us 0.000us 3
4077
- cudaDeviceSynchronize 63.15% 3.537ms 63.15% 3.537ms 3.537ms 0.000us 0.00% 0.000us 0.000us 1
4078
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4079
- Self CPU time total: 5.602ms
4080
- Self CUDA time total: 3.682ms
4081
 
4082
 
4083
  impl wl p50(ms) ok
4084
- hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
4085
- hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True
4086
- hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.04 True
4087
- hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
4088
- hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True
4089
- hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.24 True
4090
  </pre></div>
4091
  <div class="cell-stderr">
4092
  Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
4093
 
4094
- Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:15, 1.16it/s]
4095
- Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 11.63it/s]
4096
  </div>
4097
  <div class="cell-artifacts">
4098
  <h4>Artifacts:</h4>
 
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: benchmark | 5.83s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3943
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3944
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3945
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3946
+ hf_kernels_flash_attn 3.32% 153.894us 44.44% 2.062ms 2.062ms 0.000us 0.00% 3.741ms 3.741ms 1
3947
+ _flash_attn_9e27194::fwd 1.40% 65.047us 41.12% 1.908ms 636.067us 2.793ms 100.00% 3.741ms 1.247ms 3
3948
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.795ms 100.05% 2.795ms 2.795ms 1
3949
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.793ms 100.00% 2.793ms 931.053us 3
3950
+ Activity Buffer Request 36.76% 1.706ms 36.76% 1.706ms 1.706ms 947.811us 33.93% 947.811us 947.811us 1
3951
+ cudaDeviceGetAttribute 0.09% 4.281us 0.09% 4.281us 0.285us 0.000us 0.00% 0.000us 0.000us 15
3952
+ aten::empty_like 0.44% 20.280us 1.17% 54.161us 18.054us 0.000us 0.00% 0.000us 0.000us 3
3953
+ aten::empty_strided 0.73% 33.881us 0.73% 33.881us 11.294us 0.000us 0.00% 0.000us 0.000us 3
3954
+ aten::empty 0.53% 24.740us 0.53% 24.740us 2.749us 0.000us 0.00% 0.000us 0.000us 9
3955
+ cudaFuncSetAttribute 0.29% 13.452us 0.29% 13.452us 4.484us 0.000us 0.00% 0.000us 0.000us 3
3956
+ cudaLaunchKernel 0.87% 40.582us 0.87% 40.582us 13.527us 0.000us 0.00% 0.000us 0.000us 3
3957
+ cudaDeviceSynchronize 55.56% 2.579ms 55.56% 2.579ms 2.579ms 0.000us 0.00% 0.000us 0.000us 1
3958
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3959
+ Self CPU time total: 4.641ms
3960
+ Self CUDA time total: 2.793ms
3961
 
3962
 
3963
 
 
3967
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3968
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3969
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3970
+ hf_kernels_flash_attn 1.87% 88.452us 41.15% 1.950ms 1.950ms 0.000us 0.00% 3.925ms 3.925ms 1
3971
+ _flash_attn_9e27194::fwd 0.93% 44.030us 39.28% 1.861ms 620.420us 2.932ms 100.00% 3.925ms 1.308ms 3
3972
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 2.933ms 100.05% 2.933ms 2.933ms 1
3973
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 2.932ms 100.00% 2.932ms 977.209us 3
3974
+ Activity Buffer Request 36.67% 1.738ms 36.67% 1.738ms 1.738ms 993.604us 33.89% 993.604us 993.604us 1
3975
+ cudaDeviceGetAttribute 0.08% 3.589us 0.08% 3.589us 0.239us 0.000us 0.00% 0.000us 0.000us 15
3976
+ aten::empty_like 0.16% 7.361us 0.48% 22.851us 7.617us 0.000us 0.00% 0.000us 0.000us 3
3977
+ aten::empty_strided 0.33% 15.490us 0.33% 15.490us 5.163us 0.000us 0.00% 0.000us 0.000us 3
3978
+ aten::empty 0.44% 21.020us 0.44% 21.020us 2.336us 0.000us 0.00% 0.000us 0.000us 9
3979
+ cudaFuncSetAttribute 0.07% 3.450us 0.07% 3.450us 1.150us 0.000us 0.00% 0.000us 0.000us 3
3980
+ cudaLaunchKernel 0.60% 28.443us 0.60% 28.443us 9.481us 0.000us 0.00% 0.000us 0.000us 3
3981
+ cudaDeviceSynchronize 58.85% 2.789ms 58.85% 2.789ms 2.789ms 0.000us 0.00% 0.000us 0.000us 1
3982
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3983
+ Self CPU time total: 4.739ms
3984
+ Self CUDA time total: 2.932ms
3985
 
3986
 
3987
 
 
3991
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3992
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3993
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3994
+ hf_kernels_flash_attn 2.16% 105.271us 40.16% 1.954ms 1.954ms 0.000us 0.00% 4.088ms 4.088ms 1
3995
+ _flash_attn_9e27194::fwd 0.92% 44.671us 38.00% 1.849ms 616.384us 3.054ms 100.00% 4.088ms 1.363ms 3
3996
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.056ms 100.05% 3.056ms 3.056ms 1
3997
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.054ms 100.00% 3.054ms 1.018ms 3
3998
+ Activity Buffer Request 35.43% 1.724ms 35.43% 1.724ms 1.724ms 1.034ms 33.84% 1.034ms 1.034ms 1
3999
+ cudaDeviceGetAttribute 0.08% 3.741us 0.08% 3.741us 0.249us 0.000us 0.00% 0.000us 0.000us 15
4000
+ aten::empty_like 0.15% 7.380us 0.46% 22.580us 7.527us 0.000us 0.00% 0.000us 0.000us 3
4001
+ aten::empty_strided 0.31% 15.200us 0.31% 15.200us 5.067us 0.000us 0.00% 0.000us 0.000us 3
4002
+ aten::empty 0.43% 20.900us 0.43% 20.900us 2.322us 0.000us 0.00% 0.000us 0.000us 9
4003
+ cudaFuncSetAttribute 0.07% 3.441us 0.07% 3.441us 1.147us 0.000us 0.00% 0.000us 0.000us 3
4004
+ cudaLaunchKernel 0.61% 29.670us 0.61% 29.670us 9.890us 0.000us 0.00% 0.000us 0.000us 3
4005
+ cudaDeviceSynchronize 59.84% 2.912ms 59.84% 2.912ms 2.912ms 0.000us 0.00% 0.000us 0.000us 1
4006
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4007
+ Self CPU time total: 4.867ms
4008
+ Self CUDA time total: 3.054ms
4009
 
4010
 
4011
 
 
4015
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4016
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4017
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4018
+ hf_kernels_flash_attn 1.99% 101.304us 41.40% 2.105ms 2.105ms 0.000us 0.00% 4.182ms 4.182ms 1
4019
+ _flash_attn_9e27194::fwd 0.90% 45.720us 39.41% 2.004ms 667.947us 3.124ms 100.00% 4.182ms 1.394ms 3
4020
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.125ms 100.05% 3.125ms 3.125ms 1
4021
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.124ms 100.00% 3.124ms 1.041ms 3
4022
+ Activity Buffer Request 33.36% 1.696ms 33.36% 1.696ms 1.696ms 1.058ms 33.87% 1.058ms 1.058ms 1
4023
+ cudaDeviceGetAttribute 0.07% 3.650us 0.07% 3.650us 0.243us 0.000us 0.00% 0.000us 0.000us 15
4024
+ aten::empty_like 0.15% 7.421us 0.48% 24.201us 8.067us 0.000us 0.00% 0.000us 0.000us 3
4025
+ aten::empty_strided 0.33% 16.780us 0.33% 16.780us 5.593us 0.000us 0.00% 0.000us 0.000us 3
4026
+ aten::empty 0.42% 21.431us 0.42% 21.431us 2.381us 0.000us 0.00% 0.000us 0.000us 9
4027
+ cudaFuncSetAttribute 0.08% 4.070us 0.08% 4.070us 1.357us 0.000us 0.00% 0.000us 0.000us 3
4028
+ cudaLaunchKernel 4.10% 208.474us 4.10% 208.474us 69.491us 0.000us 0.00% 0.000us 0.000us 3
4029
+ cudaDeviceSynchronize 58.60% 2.980ms 58.60% 2.980ms 2.980ms 0.000us 0.00% 0.000us 0.000us 1
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
+ Self CPU time total: 5.085ms
4032
+ Self CUDA time total: 3.124ms
4033
 
4034
 
4035
 
 
4039
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4040
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4041
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4042
+ hf_kernels_flash_attn 1.92% 106.253us 37.17% 2.059ms 2.059ms 0.000us 0.00% 4.843ms 4.843ms 1
4043
+ _flash_attn_9e27194::fwd 0.86% 47.751us 35.25% 1.953ms 651.011us 3.628ms 100.00% 4.843ms 1.614ms 3
4044
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.629ms 100.04% 3.629ms 3.629ms 1
4045
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.628ms 100.00% 3.628ms 1.209ms 3
4046
+ Activity Buffer Request 30.14% 1.670ms 30.14% 1.670ms 1.670ms 1.215ms 33.50% 1.215ms 1.215ms 1
4047
+ cudaDeviceGetAttribute 0.07% 3.881us 0.07% 3.881us 0.259us 0.000us 0.00% 0.000us 0.000us 15
4048
+ aten::empty_like 0.14% 7.581us 0.43% 24.021us 8.007us 0.000us 0.00% 0.000us 0.000us 3
4049
+ aten::empty_strided 0.30% 16.440us 0.30% 16.440us 5.480us 0.000us 0.00% 0.000us 0.000us 3
4050
+ aten::empty 0.39% 21.710us 0.39% 21.710us 2.412us 0.000us 0.00% 0.000us 0.000us 9
4051
+ cudaFuncSetAttribute 0.07% 3.650us 0.07% 3.650us 1.217us 0.000us 0.00% 0.000us 0.000us 3
4052
+ cudaLaunchKernel 3.29% 182.154us 3.29% 182.154us 60.718us 0.000us 0.00% 0.000us 0.000us 3
4053
+ cudaDeviceSynchronize 62.83% 3.482ms 62.83% 3.482ms 3.482ms 0.000us 0.00% 0.000us 0.000us 1
4054
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4055
+ Self CPU time total: 5.541ms
4056
+ Self CUDA time total: 3.628ms
4057
 
4058
 
4059
 
 
4063
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4064
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
+ hf_kernels_flash_attn 1.86% 105.712us 36.76% 2.092ms 2.092ms 0.000us 0.00% 4.990ms 4.990ms 1
4067
+ _flash_attn_9e27194::fwd 0.87% 49.631us 34.91% 1.986ms 661.968us 3.741ms 100.00% 4.990ms 1.663ms 3
4068
+ hf_kernels_flash_attn 0.00% 0.000us 0.00% 0.000us 0.000us 3.743ms 100.05% 3.743ms 3.743ms 1
4069
+ void flash::flash_fwd_kernel&lt;Flash_fwd_kernel_traits... 0.00% 0.000us 0.00% 0.000us 0.000us 3.741ms 100.00% 3.741ms 1.247ms 3
4070
+ Activity Buffer Request 29.90% 1.701ms 29.90% 1.701ms 1.701ms 1.249ms 33.38% 1.249ms 1.249ms 1
4071
+ cudaDeviceGetAttribute 0.06% 3.600us 0.06% 3.600us 0.240us 0.000us 0.00% 0.000us 0.000us 15
4072
+ aten::empty_like 0.14% 7.780us 0.42% 24.150us 8.050us 0.000us 0.00% 0.000us 0.000us 3
4073
+ aten::empty_strided 0.29% 16.370us 0.29% 16.370us 5.457us 0.000us 0.00% 0.000us 0.000us 3
4074
+ aten::empty 0.38% 21.420us 0.38% 21.420us 2.380us 0.000us 0.00% 0.000us 0.000us 9
4075
+ cudaFuncSetAttribute 0.06% 3.580us 0.06% 3.580us 1.193us 0.000us 0.00% 0.000us 0.000us 3
4076
+ cudaLaunchKernel 3.20% 182.154us 3.20% 182.154us 60.718us 0.000us 0.00% 0.000us 0.000us 3
4077
+ cudaDeviceSynchronize 63.24% 3.598ms 63.24% 3.598ms 3.598ms 0.000us 0.00% 0.000us 0.000us 1
4078
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4079
+ Self CPU time total: 5.689ms
4080
+ Self CUDA time total: 3.741ms
4081
 
4082
 
4083
  impl wl p50(ms) ok
4084
+ hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True
4085
+ hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True
4086
+ hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
4087
+ hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.08 True
4088
+ hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.24 True
4089
+ hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.25 True
4090
  </pre></div>
4091
  <div class="cell-stderr">
4092
  Fetching 20 files: 0%| | 0/20 [00:00&lt;?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
4093
 
4094
+ Fetching 20 files: 10%|█ | 2/20 [00:01&lt;00:14, 1.28it/s]
4095
+ Fetching 20 files: 100%|██████████| 20/20 [00:01&lt;00:00, 12.76it/s]
4096
  </div>
4097
  <div class="cell-artifacts">
4098
  <h4>Artifacts:</h4>
flash_attn/impls/hf_kernels_flash_attn3.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: benchmark | 6.33s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3942,19 +3942,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L128_bfloat16
3942
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3943
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3944
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3945
- hf_kernels_flash_attn3 3.59% 165.413us 48.47% 2.234ms 2.234ms 0.000us 0.00% 3.561ms 3.561ms 1
3946
- FlashAttnFunc 2.69% 124.054us 44.88% 2.069ms 689.509us 0.000us 0.00% 3.561ms 1.187ms 3
3947
- _flash_attn3_1d39a44::fwd 1.63% 74.991us 42.19% 1.944ms 648.158us 2.673ms 100.00% 3.561ms 1.187ms 3
3948
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.674ms 100.05% 2.674ms 2.674ms 1
3949
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.673ms 100.00% 2.673ms 890.896us 3
3950
- Activity Buffer Request 38.25% 1.763ms 38.25% 1.763ms 1.763ms 888.250us 33.23% 888.250us 888.250us 1
3951
- aten::empty 0.95% 43.951us 0.95% 43.951us 7.325us 0.000us 0.00% 0.000us 0.000us 6
3952
- cudaFuncSetAttribute 0.32% 14.620us 0.32% 14.620us 4.873us 0.000us 0.00% 0.000us 0.000us 3
3953
- cudaLaunchKernel 1.04% 47.991us 1.04% 47.991us 15.997us 0.000us 0.00% 0.000us 0.000us 3
3954
- cudaDeviceSynchronize 51.53% 2.375ms 51.53% 2.375ms 2.375ms 0.000us 0.00% 0.000us 0.000us 1
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
- Self CPU time total: 4.609ms
3957
- Self CUDA time total: 2.673ms
3958
 
3959
 
3960
 
@@ -3964,19 +3964,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L256_bfloat16
3964
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3965
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3966
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3967
- hf_kernels_flash_attn3 2.68% 124.013us 44.92% 2.080ms 2.080ms 0.000us 0.00% 3.716ms 3.716ms 1
3968
- FlashAttnFunc 1.96% 90.863us 42.24% 1.956ms 652.078us 0.000us 0.00% 3.716ms 1.239ms 3
3969
- _flash_attn3_1d39a44::fwd 1.06% 49.109us 40.28% 1.865ms 621.790us 2.770ms 100.00% 3.716ms 1.239ms 3
3970
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.772ms 100.05% 2.772ms 2.772ms 1
3971
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.770ms 100.00% 2.770ms 923.461us 3
3972
- Activity Buffer Request 37.83% 1.752ms 37.83% 1.752ms 1.752ms 945.210us 34.12% 945.210us 945.210us 1
3973
- aten::empty 0.60% 27.931us 0.60% 27.931us 4.655us 0.000us 0.00% 0.000us 0.000us 6
3974
- cudaFuncSetAttribute 0.12% 5.520us 0.12% 5.520us 1.840us 0.000us 0.00% 0.000us 0.000us 3
3975
- cudaLaunchKernel 0.67% 30.831us 0.67% 30.831us 10.277us 0.000us 0.00% 0.000us 0.000us 3
3976
- cudaDeviceSynchronize 55.08% 2.551ms 55.08% 2.551ms 2.551ms 0.000us 0.00% 0.000us 0.000us 1
3977
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3978
- Self CPU time total: 4.631ms
3979
- Self CUDA time total: 2.770ms
3980
 
3981
 
3982
 
@@ -3986,19 +3986,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L320_bfloat16
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
- hf_kernels_flash_attn3 2.68% 125.914us 44.02% 2.072ms 2.072ms 0.000us 0.00% 3.816ms 3.816ms 1
3990
- FlashAttnFunc 1.89% 89.112us 41.34% 1.946ms 648.608us 0.000us 0.00% 3.816ms 1.272ms 3
3991
- _flash_attn3_1d39a44::fwd 1.01% 47.500us 39.45% 1.857ms 618.904us 2.847ms 100.00% 3.816ms 1.272ms 3
3992
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.849ms 100.05% 2.849ms 2.849ms 1
3993
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.847ms 100.00% 2.847ms 949.087us 3
3994
- Activity Buffer Request 37.07% 1.745ms 37.07% 1.745ms 1.745ms 968.895us 34.03% 968.895us 968.895us 1
3995
- aten::empty 0.58% 27.171us 0.58% 27.171us 4.529us 0.000us 0.00% 0.000us 0.000us 6
3996
- cudaFuncSetAttribute 0.12% 5.621us 0.12% 5.621us 1.874us 0.000us 0.00% 0.000us 0.000us 3
3997
- cudaLaunchKernel 0.67% 31.690us 0.67% 31.690us 10.563us 0.000us 0.00% 0.000us 0.000us 3
3998
- cudaDeviceSynchronize 55.98% 2.635ms 55.98% 2.635ms 2.635ms 0.000us 0.00% 0.000us 0.000us 1
3999
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4000
- Self CPU time total: 4.706ms
4001
- Self CUDA time total: 2.847ms
4002
 
4003
 
4004
 
@@ -4008,19 +4008,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L384_bfloat16
4008
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4009
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4010
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4011
- hf_kernels_flash_attn3 2.55% 127.134us 45.51% 2.268ms 2.268ms 0.000us 0.00% 3.920ms 3.920ms 1
4012
- FlashAttnFunc 1.80% 89.881us 42.96% 2.141ms 713.505us 0.000us 0.00% 3.920ms 1.307ms 3
4013
- _flash_attn3_1d39a44::fwd 0.97% 48.541us 41.15% 2.051ms 683.545us 2.930ms 100.00% 3.920ms 1.307ms 3
4014
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.932ms 100.05% 2.932ms 2.932ms 1
4015
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.930ms 100.00% 2.930ms 976.824us 3
4016
- Activity Buffer Request 35.08% 1.748ms 35.08% 1.748ms 1.748ms 989.112us 33.75% 989.112us 989.112us 1
4017
- aten::empty 0.54% 27.071us 0.54% 27.071us 4.512us 0.000us 0.00% 0.000us 0.000us 6
4018
- cudaFuncSetAttribute 0.11% 5.498us 0.11% 5.498us 1.833us 0.000us 0.00% 0.000us 0.000us 3
4019
- cudaLaunchKernel 4.45% 221.646us 4.45% 221.646us 73.882us 0.000us 0.00% 0.000us 0.000us 3
4020
- cudaDeviceSynchronize 54.49% 2.715ms 54.49% 2.715ms 2.715ms 0.000us 0.00% 0.000us 0.000us 1
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
- Self CPU time total: 4.983ms
4023
- Self CUDA time total: 2.930ms
4024
 
4025
 
4026
 
@@ -4030,19 +4030,19 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L448_bfloat16
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4032
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4033
- hf_kernels_flash_attn3 2.34% 128.034us 40.76% 2.227ms 2.227ms 0.000us 0.00% 4.607ms 4.607ms 1
4034
- FlashAttnFunc 1.67% 91.131us 38.42% 2.098ms 699.492us 0.000us 0.00% 4.607ms 1.536ms 3
4035
- _flash_attn3_1d39a44::fwd 0.87% 47.661us 36.75% 2.007ms 669.115us 3.452ms 100.00% 4.607ms 1.536ms 3
4036
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.453ms 100.05% 3.453ms 3.453ms 1
4037
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.452ms 100.00% 3.452ms 1.151ms 3
4038
- Activity Buffer Request 31.93% 1.744ms 31.93% 1.744ms 1.744ms 1.156ms 33.48% 1.156ms 1.156ms 1
4039
- aten::empty 0.52% 28.231us 0.52% 28.231us 4.705us 0.000us 0.00% 0.000us 0.000us 6
4040
- cudaFuncSetAttribute 0.10% 5.270us 0.10% 5.270us 1.757us 0.000us 0.00% 0.000us 0.000us 3
4041
- cudaLaunchKernel 3.33% 181.994us 3.33% 181.994us 60.665us 0.000us 0.00% 0.000us 0.000us 3
4042
- cudaDeviceSynchronize 59.24% 3.235ms 59.24% 3.235ms 3.235ms 0.000us 0.00% 0.000us 0.000us 1
4043
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4044
- Self CPU time total: 5.462ms
4045
- Self CUDA time total: 3.452ms
4046
 
4047
 
4048
 
@@ -4052,40 +4052,39 @@ PROFILE TRACE: hf_kernels_flash_attn3 | cuda_attn_L512_bfloat16
4052
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4053
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4054
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4055
- hf_kernels_flash_attn3 2.42% 135.303us 41.95% 2.345ms 2.345ms 0.000us 0.00% 4.617ms 4.617ms 1
4056
- FlashAttnFunc 1.78% 99.322us 39.53% 2.210ms 736.513us 0.000us 0.00% 4.617ms 1.539ms 3
4057
- _flash_attn3_1d39a44::fwd 0.92% 51.382us 37.75% 2.110ms 703.406us 3.463ms 100.00% 4.617ms 1.539ms 3
4058
- hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.464ms 100.05% 3.464ms 3.464ms 1
4059
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.463ms 100.00% 3.463ms 1.154ms 3
4060
- Activity Buffer Request 33.12% 1.851ms 33.12% 1.851ms 1.851ms 1.155ms 33.34% 1.155ms 1.155ms 1
4061
- aten::empty 0.54% 30.101us 0.54% 30.101us 5.017us 0.000us 0.00% 0.000us 0.000us 6
4062
- cudaFuncSetAttribute 0.10% 5.430us 0.10% 5.430us 1.810us 0.000us 0.00% 0.000us 0.000us 3
4063
- cudaLaunchKernel 3.08% 171.953us 3.08% 171.953us 57.318us 0.000us 0.00% 0.000us 0.000us 3
4064
- cudaDeviceSynchronize 58.05% 3.245ms 58.05% 3.245ms 3.245ms 0.000us 0.00% 0.000us 0.000us 1
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
- Self CPU time total: 5.590ms
4067
- Self CUDA time total: 3.463ms
4068
 
4069
 
4070
  impl wl p50(ms) ok
4071
- hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.91 True
4072
  hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True
4073
  hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True
4074
- hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.01 True
4075
- hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
4076
- hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.17 True
4077
  </pre></div>
4078
  <div class="uv-install-logs" id="uv-logs-benchmark">
4079
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4080
  <div class="uv-logs-content" style="display: none;">
4081
- Installed 14 packages in 12ms
4082
  </div>
4083
  </div>
4084
- <div class="cell-stderr">Fetching 5 files: 0%| | 0/5 [00:00&lt;?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
4085
-
4086
- Fetching 5 files: 20%|██ | 1/5 [00:00&lt;00:01, 3.45it/s]
4087
- Fetching 5 files: 40%|████ | 2/5 [00:01&lt;00:02, 1.11it/s]
4088
- Fetching 5 files: 100%|██████████| 5/5 [00:01&lt;00:00, 3.08it/s]</div>
4089
  <div class="cell-artifacts">
4090
  <h4>Artifacts:</h4>
4091
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: benchmark | 10.25s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3942
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3943
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3944
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3945
+ hf_kernels_flash_attn3 3.82% 178.994us 47.00% 2.205ms 2.205ms 0.000us 0.00% 3.693ms 3.693ms 1
3946
+ FlashAttnFunc 2.66% 124.811us 43.19% 2.026ms 675.274us 0.000us 0.00% 3.693ms 1.231ms 3
3947
+ _flash_attn3_1d39a44::fwd 1.59% 74.650us 40.52% 1.901ms 633.671us 2.792ms 100.00% 3.693ms 1.231ms 3
3948
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.794ms 100.05% 2.794ms 2.794ms 1
3949
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.792ms 100.00% 2.792ms 930.698us 3
3950
+ Activity Buffer Request 36.63% 1.718ms 36.63% 1.718ms 1.718ms 900.576us 32.25% 900.576us 900.576us 1
3951
+ aten::empty 0.99% 46.443us 0.99% 46.443us 7.741us 0.000us 0.00% 0.000us 0.000us 6
3952
+ cudaFuncSetAttribute 0.32% 14.861us 0.32% 14.861us 4.954us 0.000us 0.00% 0.000us 0.000us 3
3953
+ cudaLaunchKernel 1.00% 46.891us 1.00% 46.891us 15.630us 0.000us 0.00% 0.000us 0.000us 3
3954
+ cudaDeviceSynchronize 53.00% 2.486ms 53.00% 2.486ms 2.486ms 0.000us 0.00% 0.000us 0.000us 1
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
+ Self CPU time total: 4.691ms
3957
+ Self CUDA time total: 2.792ms
3958
 
3959
 
3960
 
 
3964
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3965
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3966
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3967
+ hf_kernels_flash_attn3 2.16% 100.183us 44.03% 2.042ms 2.042ms 0.000us 0.00% 3.752ms 3.752ms 1
3968
+ FlashAttnFunc 1.96% 91.001us 41.87% 1.942ms 647.204us 0.000us 0.00% 3.752ms 1.251ms 3
3969
+ _flash_attn3_1d39a44::fwd 1.03% 47.561us 39.91% 1.851ms 616.870us 2.814ms 100.00% 3.752ms 1.251ms 3
3970
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.816ms 100.05% 2.816ms 2.816ms 1
3971
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.814ms 100.00% 2.814ms 938.079us 3
3972
+ Activity Buffer Request 37.49% 1.739ms 37.49% 1.739ms 1.739ms 937.887us 33.33% 937.887us 937.887us 1
3973
+ aten::empty 0.58% 26.762us 0.58% 26.762us 4.460us 0.000us 0.00% 0.000us 0.000us 6
3974
+ cudaFuncSetAttribute 0.11% 5.220us 0.11% 5.220us 1.740us 0.000us 0.00% 0.000us 0.000us 3
3975
+ cudaLaunchKernel 0.70% 32.410us 0.70% 32.410us 10.803us 0.000us 0.00% 0.000us 0.000us 3
3976
+ cudaDeviceSynchronize 55.97% 2.595ms 55.97% 2.595ms 2.595ms 0.000us 0.00% 0.000us 0.000us 1
3977
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3978
+ Self CPU time total: 4.637ms
3979
+ Self CUDA time total: 2.814ms
3980
 
3981
 
3982
 
 
3986
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3987
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
+ hf_kernels_flash_attn3 2.13% 100.213us 42.34% 1.994ms 1.994ms 0.000us 0.00% 3.924ms 3.924ms 1
3990
+ FlashAttnFunc 1.82% 85.940us 40.21% 1.894ms 631.253us 0.000us 0.00% 3.924ms 1.308ms 3
3991
+ _flash_attn3_1d39a44::fwd 1.03% 48.325us 38.38% 1.808ms 602.607us 2.927ms 100.00% 3.924ms 1.308ms 3
3992
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.929ms 100.05% 2.929ms 2.929ms 1
3993
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.927ms 100.00% 2.927ms 975.684us 3
3994
+ Activity Buffer Request 36.02% 1.697ms 36.02% 1.697ms 1.697ms 997.252us 34.07% 997.252us 997.252us 1
3995
+ aten::empty 0.56% 26.419us 0.56% 26.419us 4.403us 0.000us 0.00% 0.000us 0.000us 6
3996
+ cudaFuncSetAttribute 0.12% 5.490us 0.12% 5.490us 1.830us 0.000us 0.00% 0.000us 0.000us 3
3997
+ cudaLaunchKernel 0.66% 31.020us 0.66% 31.020us 10.340us 0.000us 0.00% 0.000us 0.000us 3
3998
+ cudaDeviceSynchronize 57.66% 2.716ms 57.66% 2.716ms 2.716ms 0.000us 0.00% 0.000us 0.000us 1
3999
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4000
+ Self CPU time total: 4.710ms
4001
+ Self CUDA time total: 2.927ms
4002
 
4003
 
4004
 
 
4008
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4009
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4010
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4011
+ hf_kernels_flash_attn3 2.19% 98.471us 39.26% 1.764ms 1.764ms 0.000us 0.00% 3.945ms 3.945ms 1
4012
+ FlashAttnFunc 1.97% 88.443us 37.06% 1.666ms 555.216us 0.000us 0.00% 3.945ms 1.315ms 3
4013
+ _flash_attn3_1d39a44::fwd 1.11% 49.881us 35.10% 1.577ms 525.735us 2.942ms 100.00% 3.945ms 1.315ms 3
4014
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 2.943ms 100.05% 2.943ms 2.943ms 1
4015
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.942ms 100.00% 2.942ms 980.556us 3
4016
+ Activity Buffer Request 27.81% 1.250ms 27.81% 1.250ms 1.250ms 1.003ms 34.09% 1.003ms 1.003ms 1
4017
+ aten::empty 0.60% 26.780us 0.60% 26.780us 4.463us 0.000us 0.00% 0.000us 0.000us 6
4018
+ cudaFuncSetAttribute 0.11% 5.141us 0.11% 5.141us 1.714us 0.000us 0.00% 0.000us 0.000us 3
4019
+ cudaLaunchKernel 5.46% 245.555us 5.46% 245.555us 81.852us 0.000us 0.00% 0.000us 0.000us 3
4020
+ cudaDeviceSynchronize 60.74% 2.730ms 60.74% 2.730ms 2.730ms 0.000us 0.00% 0.000us 0.000us 1
4021
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4022
+ Self CPU time total: 4.494ms
4023
+ Self CUDA time total: 2.942ms
4024
 
4025
 
4026
 
 
4030
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4031
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4032
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4033
+ hf_kernels_flash_attn3 1.83% 100.852us 39.93% 2.202ms 2.202ms 0.000us 0.00% 4.714ms 4.714ms 1
4034
+ FlashAttnFunc 1.62% 89.332us 38.10% 2.101ms 700.422us 0.000us 0.00% 4.714ms 1.571ms 3
4035
+ _flash_attn3_1d39a44::fwd 0.86% 47.622us 36.48% 2.012ms 670.645us 3.530ms 100.00% 4.714ms 1.571ms 3
4036
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.531ms 100.04% 3.531ms 3.531ms 1
4037
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.530ms 100.00% 3.530ms 1.177ms 3
4038
+ Activity Buffer Request 31.48% 1.736ms 31.48% 1.736ms 1.736ms 1.184ms 33.56% 1.184ms 1.184ms 1
4039
+ aten::empty 0.51% 27.890us 0.51% 27.890us 4.648us 0.000us 0.00% 0.000us 0.000us 6
4040
+ cudaFuncSetAttribute 0.09% 5.140us 0.09% 5.140us 1.713us 0.000us 0.00% 0.000us 0.000us 3
4041
+ cudaLaunchKernel 3.53% 194.875us 3.53% 194.875us 64.958us 0.000us 0.00% 0.000us 0.000us 3
4042
+ cudaDeviceSynchronize 60.07% 3.313ms 60.07% 3.313ms 3.313ms 0.000us 0.00% 0.000us 0.000us 1
4043
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4044
+ Self CPU time total: 5.515ms
4045
+ Self CUDA time total: 3.530ms
4046
 
4047
 
4048
 
 
4052
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4053
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4054
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4055
+ hf_kernels_flash_attn3 1.85% 100.143us 39.23% 2.129ms 2.129ms 0.000us 0.00% 4.688ms 4.688ms 1
4056
+ FlashAttnFunc 1.59% 86.190us 37.39% 2.029ms 676.324us 0.000us 0.00% 4.688ms 1.563ms 3
4057
+ _flash_attn3_1d39a44::fwd 0.90% 48.962us 35.80% 1.943ms 647.594us 3.510ms 100.00% 4.688ms 1.563ms 3
4058
+ hf_kernels_flash_attn3 0.00% 0.000us 0.00% 0.000us 0.000us 3.512ms 100.05% 3.512ms 3.512ms 1
4059
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.510ms 100.00% 3.510ms 1.170ms 3
4060
+ Activity Buffer Request 31.16% 1.691ms 31.16% 1.691ms 1.691ms 1.178ms 33.55% 1.178ms 1.178ms 1
4061
+ aten::empty 0.49% 26.491us 0.49% 26.491us 4.415us 0.000us 0.00% 0.000us 0.000us 6
4062
+ cudaFuncSetAttribute 0.09% 5.060us 0.09% 5.060us 1.687us 0.000us 0.00% 0.000us 0.000us 3
4063
+ cudaLaunchKernel 3.15% 171.134us 3.15% 171.134us 57.045us 0.000us 0.00% 0.000us 0.000us 3
4064
+ cudaDeviceSynchronize 60.77% 3.297ms 60.77% 3.297ms 3.297ms 0.000us 0.00% 0.000us 0.000us 1
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
+ Self CPU time total: 5.427ms
4067
+ Self CUDA time total: 3.510ms
4068
 
4069
 
4070
  impl wl p50(ms) ok
4071
+ hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.93 True
4072
  hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True
4073
  hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True
4074
+ hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
4075
+ hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.19 True
4076
+ hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.22 True
4077
  </pre></div>
4078
  <div class="uv-install-logs" id="uv-logs-benchmark">
4079
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4080
  <div class="uv-logs-content" style="display: none;">
4081
+ Installed 51 packages in 298ms
4082
  </div>
4083
  </div>
4084
+ <div class="cell-stderr">Fetching 5 files: 0%| | 0/5 [00:00&lt;?, ?it/s]
4085
+ Fetching 5 files: 20%|██ | 1/5 [00:00&lt;00:00, 9.30it/s]
4086
+ Fetching 5 files: 40%|████ | 2/5 [00:01&lt;00:02, 1.12it/s]
4087
+ Fetching 5 files: 100%|██████████| 5/5 [00:01&lt;00:00, 3.22it/s]</div>
 
4088
  <div class="cell-artifacts">
4089
  <h4>Artifacts:</h4>
4090
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/mem_efficient_attention.html CHANGED
@@ -3886,9 +3886,9 @@ body[data-tool="eraser"] .main-content {
3886
  <span class="collapse-indicators">
3887
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
- <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: benchmark | 8.14s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3941,28 +3941,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L128_bfloat16
3941
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3942
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3943
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3944
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.462ms 101.52% 5.462ms 5.462ms 1
3945
- torch_mem_eff 4.78% 351.785us 36.36% 2.675ms 2.675ms 0.000us 0.00% 5.434ms 5.434ms 1
3946
- aten::scaled_dot_product_attention 0.44% 32.361us 3.09% 227.216us 75.739us 0.000us 0.00% 4.760ms 1.587ms 3
3947
- aten::_scaled_dot_product_efficient_attention 0.32% 23.392us 2.65% 194.855us 64.952us 0.000us 0.00% 4.760ms 1.587ms 3
3948
- aten::_efficient_attention_forward 0.47% 34.731us 1.98% 145.602us 48.534us 4.760ms 88.47% 4.760ms 1.587ms 3
3949
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.760ms 88.47% 4.760ms 1.587ms 3
3950
- aten::contiguous 0.14% 10.161us 27.51% 2.023ms 224.817us 0.000us 0.00% 673.947us 74.883us 9
3951
- aten::clone 0.40% 29.063us 27.37% 2.013ms 223.688us 0.000us 0.00% 673.947us 74.883us 9
3952
- aten::copy_ 1.06% 77.620us 25.90% 1.905ms 211.680us 620.444us 11.53% 673.947us 74.883us 9
3953
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 620.444us 11.53% 620.444us 68.938us 9
3954
- Activity Buffer Request 23.68% 1.742ms 23.68% 1.742ms 1.742ms 53.503us 0.99% 53.503us 53.503us 1
3955
- aten::transpose 0.99% 72.964us 1.33% 98.194us 4.091us 0.000us 0.00% 0.000us 0.000us 24
3956
- aten::as_strided 0.34% 25.230us 0.34% 25.230us 1.051us 0.000us 0.00% 0.000us 0.000us 24
3957
- aten::empty_like 0.25% 18.168us 1.07% 79.009us 8.779us 0.000us 0.00% 0.000us 0.000us 9
3958
- aten::empty 1.28% 94.381us 1.28% 94.381us 4.494us 0.000us 0.00% 0.000us 0.000us 21
3959
- cudaLaunchKernel 1.49% 109.573us 1.49% 109.573us 9.131us 0.000us 0.00% 0.000us 0.000us 12
3960
- cudaStreamIsCapturing 0.05% 3.660us 0.05% 3.660us 1.220us 0.000us 0.00% 0.000us 0.000us 3
3961
- cudaFuncSetAttribute 0.67% 49.491us 0.67% 49.491us 16.497us 0.000us 0.00% 0.000us 0.000us 3
3962
- cudaDeviceSynchronize 63.64% 4.681ms 63.64% 4.681ms 4.681ms 0.000us 0.00% 0.000us 0.000us 1
3963
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3964
- Self CPU time total: 7.356ms
3965
- Self CUDA time total: 5.380ms
3966
 
3967
 
3968
 
@@ -3972,28 +3972,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L256_bfloat16
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
- torch_mem_eff 2.99% 227.637us 31.17% 2.369ms 2.369ms 0.000us 0.00% 5.835ms 5.835ms 1
3976
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.790ms 100.14% 5.790ms 5.790ms 1
3977
- aten::scaled_dot_product_attention 0.23% 17.721us 1.87% 142.143us 47.381us 0.000us 0.00% 5.146ms 1.715ms 3
3978
- aten::_scaled_dot_product_efficient_attention 0.25% 18.819us 1.64% 124.422us 41.474us 0.000us 0.00% 5.146ms 1.715ms 3
3979
- aten::_efficient_attention_forward 0.37% 28.141us 1.08% 82.262us 27.421us 5.146ms 89.01% 5.146ms 1.715ms 3
3980
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.146ms 89.01% 5.146ms 1.715ms 3
3981
- aten::contiguous 0.09% 6.739us 25.75% 1.957ms 217.483us 0.000us 0.00% 689.503us 76.611us 9
3982
- aten::clone 0.27% 20.691us 25.66% 1.951ms 216.734us 0.000us 0.00% 689.503us 76.611us 9
3983
- aten::copy_ 0.83% 62.851us 24.72% 1.879ms 208.808us 635.680us 10.99% 689.503us 76.611us 9
3984
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 635.680us 10.99% 635.680us 70.631us 9
3985
- Activity Buffer Request 23.06% 1.753ms 23.06% 1.753ms 1.753ms 53.823us 0.93% 53.823us 53.823us 1
3986
- aten::transpose 0.63% 47.890us 0.86% 65.431us 2.726us 0.000us 0.00% 0.000us 0.000us 24
3987
- aten::as_strided 0.23% 17.541us 0.23% 17.541us 0.731us 0.000us 0.00% 0.000us 0.000us 24
3988
- aten::empty_like 0.15% 11.310us 0.67% 50.641us 5.627us 0.000us 0.00% 0.000us 0.000us 9
3989
- aten::empty 0.87% 66.232us 0.87% 66.232us 3.154us 0.000us 0.00% 0.000us 0.000us 21
3990
- cudaLaunchKernel 1.12% 85.492us 1.12% 85.492us 7.124us 0.000us 0.00% 0.000us 0.000us 12
3991
- cudaStreamIsCapturing 0.03% 2.460us 0.03% 2.460us 0.820us 0.000us 0.00% 0.000us 0.000us 3
3992
- cudaFuncSetAttribute 0.04% 3.070us 0.04% 3.070us 1.023us 0.000us 0.00% 0.000us 0.000us 3
3993
- cudaDeviceSynchronize 68.83% 5.232ms 68.83% 5.232ms 5.232ms 0.000us 0.00% 0.000us 0.000us 1
3994
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3995
- Self CPU time total: 7.601ms
3996
- Self CUDA time total: 5.782ms
3997
 
3998
 
3999
 
@@ -4003,28 +4003,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L320_bfloat16
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4005
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
- torch_mem_eff 2.88% 222.044us 30.17% 2.327ms 2.327ms 0.000us 0.00% 5.986ms 5.986ms 1
4007
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.939ms 100.13% 5.939ms 5.939ms 1
4008
- aten::scaled_dot_product_attention 0.24% 18.710us 1.85% 142.303us 47.434us 0.000us 0.00% 5.284ms 1.761ms 3
4009
- aten::_scaled_dot_product_efficient_attention 0.25% 19.190us 1.60% 123.593us 41.198us 0.000us 0.00% 5.284ms 1.761ms 3
4010
- aten::_efficient_attention_forward 0.36% 27.947us 1.05% 81.281us 27.094us 5.284ms 89.10% 5.284ms 1.761ms 3
4011
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.284ms 89.10% 5.284ms 1.761ms 3
4012
- aten::contiguous 0.09% 7.300us 24.90% 1.920ms 213.350us 0.000us 0.00% 702.238us 78.026us 9
4013
- aten::clone 0.28% 21.930us 24.80% 1.913ms 212.539us 0.000us 0.00% 702.238us 78.026us 9
4014
- aten::copy_ 0.79% 60.872us 23.86% 1.840ms 204.449us 646.526us 10.90% 702.238us 78.026us 9
4015
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 646.526us 10.90% 646.526us 71.836us 9
4016
- Activity Buffer Request 22.23% 1.715ms 22.23% 1.715ms 1.715ms 55.712us 0.94% 55.712us 55.712us 1
4017
- aten::transpose 0.63% 48.814us 0.85% 65.893us 2.746us 0.000us 0.00% 0.000us 0.000us 24
4018
- aten::as_strided 0.22% 17.079us 0.22% 17.079us 0.712us 0.000us 0.00% 0.000us 0.000us 24
4019
- aten::empty_like 0.15% 11.801us 0.66% 50.882us 5.654us 0.000us 0.00% 0.000us 0.000us 9
4020
- aten::empty 0.85% 65.644us 0.85% 65.644us 3.126us 0.000us 0.00% 0.000us 0.000us 21
4021
- cudaLaunchKernel 1.11% 85.622us 1.11% 85.622us 7.135us 0.000us 0.00% 0.000us 0.000us 12
4022
- cudaStreamIsCapturing 0.03% 2.511us 0.03% 2.511us 0.837us 0.000us 0.00% 0.000us 0.000us 3
4023
- cudaFuncSetAttribute 0.04% 3.110us 0.04% 3.110us 1.037us 0.000us 0.00% 0.000us 0.000us 3
4024
- cudaDeviceSynchronize 69.83% 5.385ms 69.83% 5.385ms 5.385ms 0.000us 0.00% 0.000us 0.000us 1
4025
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4026
- Self CPU time total: 7.713ms
4027
- Self CUDA time total: 5.931ms
4028
 
4029
 
4030
 
@@ -4034,28 +4034,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L384_bfloat16
4034
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4035
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
- torch_mem_eff 3.05% 248.737us 32.15% 2.620ms 2.620ms 0.000us 0.00% 6.167ms 6.167ms 1
4038
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.117ms 100.13% 6.117ms 6.117ms 1
4039
- aten::scaled_dot_product_attention 0.24% 19.380us 1.81% 147.173us 49.058us 0.000us 0.00% 5.450ms 1.817ms 3
4040
- aten::_scaled_dot_product_efficient_attention 0.23% 19.059us 1.57% 127.793us 42.598us 0.000us 0.00% 5.450ms 1.817ms 3
4041
- aten::_efficient_attention_forward 0.34% 28.111us 1.04% 84.373us 28.124us 5.450ms 89.21% 5.450ms 1.817ms 3
4042
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.450ms 89.21% 5.450ms 1.817ms 3
4043
- aten::contiguous 0.09% 7.070us 26.79% 2.183ms 242.545us 0.000us 0.00% 717.472us 79.719us 9
4044
- aten::clone 0.26% 21.211us 26.70% 2.176ms 241.760us 0.000us 0.00% 717.472us 79.719us 9
4045
- aten::copy_ 0.77% 62.427us 25.76% 2.100ms 233.287us 658.976us 10.79% 717.472us 79.719us 9
4046
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 658.976us 10.79% 658.976us 73.220us 9
4047
- Activity Buffer Request 21.68% 1.767ms 21.68% 1.767ms 1.767ms 58.496us 0.96% 58.496us 58.496us 1
4048
- aten::transpose 0.59% 47.765us 0.81% 65.883us 2.745us 0.000us 0.00% 0.000us 0.000us 24
4049
- aten::as_strided 0.22% 18.118us 0.22% 18.118us 0.755us 0.000us 0.00% 0.000us 0.000us 24
4050
- aten::empty_like 0.14% 11.420us 0.68% 55.041us 6.116us 0.000us 0.00% 0.000us 0.000us 9
4051
- aten::empty 0.87% 71.281us 0.87% 71.281us 3.394us 0.000us 0.00% 0.000us 0.000us 21
4052
- cudaLaunchKernel 3.59% 292.889us 3.59% 292.889us 24.407us 0.000us 0.00% 0.000us 0.000us 12
4053
- cudaStreamIsCapturing 0.03% 2.781us 0.03% 2.781us 0.927us 0.000us 0.00% 0.000us 0.000us 3
4054
- cudaFuncSetAttribute 0.04% 3.020us 0.04% 3.020us 1.007us 0.000us 0.00% 0.000us 0.000us 3
4055
- cudaDeviceSynchronize 67.85% 5.529ms 67.85% 5.529ms 5.529ms 0.000us 0.00% 0.000us 0.000us 1
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
- Self CPU time total: 8.150ms
4058
- Self CUDA time total: 6.109ms
4059
 
4060
 
4061
 
@@ -4065,28 +4065,28 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L448_bfloat16
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4067
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4068
- torch_mem_eff 2.74% 222.904us 29.02% 2.363ms 2.363ms 0.000us 0.00% 6.392ms 6.392ms 1
4069
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.341ms 100.13% 6.341ms 6.341ms 1
4070
- aten::scaled_dot_product_attention 0.23% 18.463us 1.76% 143.054us 47.685us 0.000us 0.00% 5.664ms 1.888ms 3
4071
- aten::_scaled_dot_product_efficient_attention 0.23% 18.699us 1.53% 124.591us 41.530us 0.000us 0.00% 5.664ms 1.888ms 3
4072
- aten::_efficient_attention_forward 0.35% 28.650us 1.01% 82.071us 27.357us 5.664ms 89.43% 5.664ms 1.888ms 3
4073
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.664ms 89.43% 5.664ms 1.888ms 3
4074
- aten::contiguous 0.09% 7.480us 24.00% 1.954ms 217.122us 0.000us 0.00% 727.838us 80.871us 9
4075
- aten::clone 0.26% 21.231us 23.90% 1.947ms 216.290us 0.000us 0.00% 727.838us 80.871us 9
4076
- aten::copy_ 0.78% 63.523us 23.01% 1.874ms 208.176us 669.182us 10.57% 727.838us 80.871us 9
4077
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 669.182us 10.57% 669.182us 74.354us 9
4078
- Activity Buffer Request 19.19% 1.562ms 19.19% 1.562ms 1.562ms 58.656us 0.93% 58.656us 58.656us 1
4079
- aten::transpose 0.60% 48.754us 0.82% 66.672us 2.778us 0.000us 0.00% 0.000us 0.000us 24
4080
- aten::as_strided 0.22% 17.918us 0.22% 17.918us 0.747us 0.000us 0.00% 0.000us 0.000us 24
4081
- aten::empty_like 0.14% 11.269us 0.64% 51.800us 5.756us 0.000us 0.00% 0.000us 0.000us 9
4082
- aten::empty 0.81% 66.291us 0.81% 66.291us 3.157us 0.000us 0.00% 0.000us 0.000us 21
4083
- cudaLaunchKernel 3.31% 269.756us 3.31% 269.756us 22.480us 0.000us 0.00% 0.000us 0.000us 12
4084
- cudaStreamIsCapturing 0.03% 2.590us 0.03% 2.590us 0.863us 0.000us 0.00% 0.000us 0.000us 3
4085
- cudaFuncSetAttribute 0.04% 2.940us 0.04% 2.940us 0.980us 0.000us 0.00% 0.000us 0.000us 3
4086
- cudaDeviceSynchronize 70.98% 5.781ms 70.98% 5.781ms 5.781ms 0.000us 0.00% 0.000us 0.000us 1
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
- Self CPU time total: 8.144ms
4089
- Self CUDA time total: 6.333ms
4090
 
4091
 
4092
 
@@ -4096,44 +4096,38 @@ PROFILE TRACE: torch_mem_eff | cuda_attn_L512_bfloat16
4096
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4097
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4098
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4099
- torch_mem_eff 2.91% 254.056us 31.19% 2.722ms 2.722ms 0.000us 0.00% 6.645ms 6.645ms 1
4100
- torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.592ms 100.12% 6.592ms 6.592ms 1
4101
- aten::scaled_dot_product_attention 0.23% 20.440us 1.69% 147.533us 49.178us 0.000us 0.00% 5.910ms 1.970ms 3
4102
- aten::_scaled_dot_product_efficient_attention 0.22% 19.250us 1.46% 127.093us 42.364us 0.000us 0.00% 5.910ms 1.970ms 3
4103
- aten::_efficient_attention_forward 0.33% 28.899us 0.98% 85.242us 28.414us 5.910ms 89.76% 5.910ms 1.970ms 3
4104
- fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.910ms 89.76% 5.910ms 1.970ms 3
4105
- aten::contiguous 0.08% 7.268us 26.04% 2.272ms 252.404us 0.000us 0.00% 734.815us 81.646us 9
4106
- aten::clone 0.28% 24.054us 25.95% 2.264ms 251.596us 0.000us 0.00% 734.815us 81.646us 9
4107
- aten::copy_ 0.77% 66.891us 25.04% 2.185ms 242.745us 674.239us 10.24% 734.815us 81.646us 9
4108
- void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 674.239us 10.24% 674.239us 74.915us 9
4109
- Activity Buffer Request 20.22% 1.764ms 20.22% 1.764ms 1.764ms 60.576us 0.92% 60.576us 60.576us 1
4110
- aten::transpose 0.62% 53.860us 0.81% 70.972us 2.957us 0.000us 0.00% 0.000us 0.000us 24
4111
- aten::as_strided 0.20% 17.112us 0.20% 17.112us 0.713us 0.000us 0.00% 0.000us 0.000us 24
4112
- aten::empty_like 0.15% 12.910us 0.64% 55.601us 6.178us 0.000us 0.00% 0.000us 0.000us 9
4113
- aten::empty 0.82% 71.503us 0.82% 71.503us 3.405us 0.000us 0.00% 0.000us 0.000us 21
4114
- cudaLaunchKernel 4.30% 375.338us 4.30% 375.338us 31.278us 0.000us 0.00% 0.000us 0.000us 12
4115
- cudaStreamIsCapturing 0.03% 2.571us 0.03% 2.571us 0.857us 0.000us 0.00% 0.000us 0.000us 3
4116
- cudaFuncSetAttribute 0.03% 3.000us 0.03% 3.000us 1.000us 0.000us 0.00% 0.000us 0.000us 3
4117
- cudaDeviceSynchronize 68.81% 6.003ms 68.81% 6.003ms 6.003ms 0.000us 0.00% 0.000us 0.000us 1
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
- Self CPU time total: 8.725ms
4120
- Self CUDA time total: 6.584ms
4121
 
4122
 
4123
  impl wl p50(ms) ok
4124
- torch_mem_eff cuda_attn_L128_bfloat16 1.83 True
4125
- torch_mem_eff cuda_attn_L256_bfloat16 1.93 True
4126
- torch_mem_eff cuda_attn_L320_bfloat16 1.95 True
4127
- torch_mem_eff cuda_attn_L384_bfloat16 2.04 True
4128
- torch_mem_eff cuda_attn_L448_bfloat16 2.08 True
4129
- torch_mem_eff cuda_attn_L512_bfloat16 2.17 True
4130
  </pre></div>
4131
- <div class="uv-install-logs" id="uv-logs-benchmark">
4132
- <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4133
- <div class="uv-logs-content" style="display: none;">
4134
- Installed 37 packages in 340ms
4135
- </div>
4136
- </div>
4137
  <div class="cell-artifacts">
4138
  <h4>Artifacts:</h4>
4139
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
 
3886
  <span class="collapse-indicators">
3887
  <span onclick="toggleCode('benchmark')" style="cursor: pointer;">▼ code</span>
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
+ <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: benchmark | 4.12s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3941
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3942
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3943
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3944
+ torch_mem_eff 4.25% 311.827us 34.94% 2.563ms 2.563ms 0.000us 0.00% 5.488ms 5.488ms 1
3945
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.465ms 100.56% 5.465ms 5.465ms 1
3946
+ aten::scaled_dot_product_attention 0.42% 30.830us 2.38% 174.593us 58.198us 0.000us 0.00% 4.817ms 1.606ms 3
3947
+ aten::_scaled_dot_product_efficient_attention 0.32% 23.429us 1.96% 143.763us 47.921us 0.000us 0.00% 4.817ms 1.606ms 3
3948
+ aten::_efficient_attention_forward 0.46% 33.832us 1.33% 97.922us 32.641us 4.817ms 88.64% 4.817ms 1.606ms 3
3949
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 4.817ms 88.64% 4.817ms 1.606ms 3
3950
+ aten::contiguous 0.14% 10.180us 27.43% 2.012ms 223.532us 0.000us 0.00% 670.850us 74.539us 9
3951
+ aten::clone 0.43% 31.262us 27.29% 2.002ms 222.401us 0.000us 0.00% 670.850us 74.539us 9
3952
+ aten::copy_ 1.01% 74.042us 25.85% 1.896ms 210.687us 617.346us 11.36% 670.850us 74.539us 9
3953
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 617.346us 11.36% 617.346us 68.594us 9
3954
+ Activity Buffer Request 23.70% 1.738ms 23.70% 1.738ms 1.738ms 53.504us 0.98% 53.504us 53.504us 1
3955
+ aten::transpose 0.89% 65.502us 1.19% 87.343us 3.639us 0.000us 0.00% 0.000us 0.000us 24
3956
+ aten::as_strided 0.30% 21.841us 0.30% 21.841us 0.910us 0.000us 0.00% 0.000us 0.000us 24
3957
+ aten::empty_like 0.21% 15.520us 1.01% 74.161us 8.240us 0.000us 0.00% 0.000us 0.000us 9
3958
+ aten::empty 1.17% 85.772us 1.17% 85.772us 4.084us 0.000us 0.00% 0.000us 0.000us 21
3959
+ cudaLaunchKernel 1.48% 108.273us 1.48% 108.273us 9.023us 0.000us 0.00% 0.000us 0.000us 12
3960
+ cudaStreamIsCapturing 0.05% 3.869us 0.05% 3.869us 1.290us 0.000us 0.00% 0.000us 0.000us 3
3961
+ cudaFuncSetAttribute 0.12% 8.830us 0.12% 8.830us 2.943us 0.000us 0.00% 0.000us 0.000us 3
3962
+ cudaDeviceSynchronize 65.06% 4.772ms 65.06% 4.772ms 4.772ms 0.000us 0.00% 0.000us 0.000us 1
3963
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3964
+ Self CPU time total: 7.335ms
3965
+ Self CUDA time total: 5.434ms
3966
 
3967
 
3968
 
 
3972
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3973
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
+ torch_mem_eff 3.26% 247.835us 31.36% 2.385ms 2.385ms 0.000us 0.00% 5.867ms 5.867ms 1
3976
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 5.821ms 100.14% 5.821ms 5.821ms 1
3977
+ aten::scaled_dot_product_attention 0.22% 16.881us 1.81% 137.424us 45.808us 0.000us 0.00% 5.175ms 1.725ms 3
3978
+ aten::_scaled_dot_product_efficient_attention 0.25% 18.660us 1.59% 120.543us 40.181us 0.000us 0.00% 5.175ms 1.725ms 3
3979
+ aten::_efficient_attention_forward 0.35% 26.843us 1.04% 78.951us 26.317us 5.175ms 89.03% 5.175ms 1.725ms 3
3980
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.175ms 89.03% 5.175ms 1.725ms 3
3981
+ aten::contiguous 0.09% 7.172us 25.72% 1.955ms 217.264us 0.000us 0.00% 691.584us 76.843us 9
3982
+ aten::clone 0.31% 23.260us 25.62% 1.948ms 216.467us 0.000us 0.00% 691.584us 76.843us 9
3983
+ aten::copy_ 0.84% 64.031us 24.18% 1.839ms 204.318us 637.408us 10.97% 691.584us 76.843us 9
3984
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 637.408us 10.97% 637.408us 70.823us 9
3985
+ Activity Buffer Request 22.42% 1.705ms 22.42% 1.705ms 1.705ms 54.176us 0.93% 54.176us 54.176us 1
3986
+ aten::transpose 0.64% 49.041us 0.88% 66.991us 2.791us 0.000us 0.00% 0.000us 0.000us 24
3987
+ aten::as_strided 0.24% 17.950us 0.24% 17.950us 0.748us 0.000us 0.00% 0.000us 0.000us 24
3988
+ aten::empty_like 0.17% 12.602us 1.13% 86.083us 9.565us 0.000us 0.00% 0.000us 0.000us 9
3989
+ aten::empty 1.29% 98.070us 1.29% 98.070us 4.670us 0.000us 0.00% 0.000us 0.000us 21
3990
+ cudaLaunchKernel 1.22% 92.470us 1.22% 92.470us 7.706us 0.000us 0.00% 0.000us 0.000us 12
3991
+ cudaStreamIsCapturing 0.04% 2.690us 0.04% 2.690us 0.897us 0.000us 0.00% 0.000us 0.000us 3
3992
+ cudaFuncSetAttribute 0.04% 2.679us 0.04% 2.679us 0.893us 0.000us 0.00% 0.000us 0.000us 3
3993
+ cudaDeviceSynchronize 68.64% 5.219ms 68.64% 5.219ms 5.219ms 0.000us 0.00% 0.000us 0.000us 1
3994
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3995
+ Self CPU time total: 7.603ms
3996
+ Self CUDA time total: 5.812ms
3997
 
3998
 
3999
 
 
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4005
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
+ torch_mem_eff 3.07% 241.867us 30.18% 2.381ms 2.381ms 0.000us 0.00% 6.114ms 6.114ms 1
4007
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.067ms 100.14% 6.067ms 6.067ms 1
4008
+ aten::scaled_dot_product_attention 0.22% 17.069us 1.75% 137.963us 45.988us 0.000us 0.00% 5.411ms 1.804ms 3
4009
+ aten::_scaled_dot_product_efficient_attention 0.24% 18.570us 1.53% 120.894us 40.298us 0.000us 0.00% 5.411ms 1.804ms 3
4010
+ aten::_efficient_attention_forward 0.35% 27.663us 1.02% 80.252us 26.751us 5.411ms 89.32% 5.411ms 1.804ms 3
4011
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.411ms 89.32% 5.411ms 1.804ms 3
4012
+ aten::contiguous 0.11% 8.338us 24.80% 1.957ms 217.397us 0.000us 0.00% 703.296us 78.144us 9
4013
+ aten::clone 0.29% 22.493us 24.69% 1.948ms 216.470us 0.000us 0.00% 703.296us 78.144us 9
4014
+ aten::copy_ 0.83% 65.242us 23.73% 1.872ms 208.052us 647.296us 10.68% 703.296us 78.144us 9
4015
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 647.296us 10.68% 647.296us 71.922us 9
4016
+ Activity Buffer Request 22.06% 1.740ms 22.06% 1.740ms 1.740ms 56.000us 0.92% 56.000us 56.000us 1
4017
+ aten::transpose 0.64% 50.792us 0.85% 67.072us 2.795us 0.000us 0.00% 0.000us 0.000us 24
4018
+ aten::as_strided 0.21% 16.280us 0.21% 16.280us 0.678us 0.000us 0.00% 0.000us 0.000us 24
4019
+ aten::empty_like 0.15% 11.839us 0.68% 53.270us 5.919us 0.000us 0.00% 0.000us 0.000us 9
4020
+ aten::empty 0.84% 66.171us 0.84% 66.171us 3.151us 0.000us 0.00% 0.000us 0.000us 21
4021
+ cudaLaunchKernel 1.13% 89.500us 1.13% 89.500us 7.458us 0.000us 0.00% 0.000us 0.000us 12
4022
+ cudaStreamIsCapturing 0.03% 2.430us 0.03% 2.430us 0.810us 0.000us 0.00% 0.000us 0.000us 3
4023
+ cudaFuncSetAttribute 0.03% 2.650us 0.03% 2.650us 0.883us 0.000us 0.00% 0.000us 0.000us 3
4024
+ cudaDeviceSynchronize 69.82% 5.508ms 69.82% 5.508ms 5.508ms 0.000us 0.00% 0.000us 0.000us 1
4025
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4026
+ Self CPU time total: 7.890ms
4027
+ Self CUDA time total: 6.058ms
4028
 
4029
 
4030
 
 
4034
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4035
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
+ torch_mem_eff 3.00% 245.113us 31.96% 2.610ms 2.610ms 0.000us 0.00% 6.162ms 6.162ms 1
4038
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.113ms 100.14% 6.113ms 6.113ms 1
4039
+ aten::scaled_dot_product_attention 0.20% 16.700us 1.71% 139.473us 46.491us 0.000us 0.00% 5.450ms 1.817ms 3
4040
+ aten::_scaled_dot_product_efficient_attention 0.23% 18.811us 1.50% 122.773us 40.924us 0.000us 0.00% 5.450ms 1.817ms 3
4041
+ aten::_efficient_attention_forward 0.34% 27.691us 0.98% 80.171us 26.724us 5.450ms 89.27% 5.450ms 1.817ms 3
4042
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.450ms 89.27% 5.450ms 1.817ms 3
4043
+ aten::contiguous 0.09% 7.732us 26.74% 2.184ms 242.673us 0.000us 0.00% 712.645us 79.183us 9
4044
+ aten::clone 0.28% 22.711us 26.65% 2.176ms 241.814us 0.000us 0.00% 712.645us 79.183us 9
4045
+ aten::copy_ 0.78% 63.988us 25.72% 2.101ms 233.430us 654.820us 10.73% 712.645us 79.183us 9
4046
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 654.820us 10.73% 654.820us 72.758us 9
4047
+ Activity Buffer Request 21.86% 1.785ms 21.86% 1.785ms 1.785ms 57.825us 0.95% 57.825us 57.825us 1
4048
+ aten::transpose 0.59% 47.982us 0.80% 65.243us 2.718us 0.000us 0.00% 0.000us 0.000us 24
4049
+ aten::as_strided 0.21% 17.261us 0.21% 17.261us 0.719us 0.000us 0.00% 0.000us 0.000us 24
4050
+ aten::empty_like 0.14% 11.742us 0.65% 52.742us 5.860us 0.000us 0.00% 0.000us 0.000us 9
4051
+ aten::empty 0.82% 66.990us 0.82% 66.990us 3.190us 0.000us 0.00% 0.000us 0.000us 21
4052
+ cudaLaunchKernel 3.34% 272.558us 3.34% 272.558us 22.713us 0.000us 0.00% 0.000us 0.000us 12
4053
+ cudaStreamIsCapturing 0.03% 2.519us 0.03% 2.519us 0.840us 0.000us 0.00% 0.000us 0.000us 3
4054
+ cudaFuncSetAttribute 0.03% 2.830us 0.03% 2.830us 0.943us 0.000us 0.00% 0.000us 0.000us 3
4055
+ cudaDeviceSynchronize 68.04% 5.557ms 68.04% 5.557ms 5.557ms 0.000us 0.00% 0.000us 0.000us 1
4056
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4057
+ Self CPU time total: 8.167ms
4058
+ Self CUDA time total: 6.105ms
4059
 
4060
 
4061
 
 
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4067
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4068
+ torch_mem_eff 2.93% 244.444us 30.49% 2.544ms 2.544ms 0.000us 0.00% 6.411ms 6.411ms 1
4069
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.361ms 100.14% 6.361ms 6.361ms 1
4070
+ aten::scaled_dot_product_attention 0.20% 16.791us 1.67% 139.273us 46.424us 0.000us 0.00% 5.684ms 1.895ms 3
4071
+ aten::_scaled_dot_product_efficient_attention 0.23% 19.350us 1.47% 122.482us 40.827us 0.000us 0.00% 5.684ms 1.895ms 3
4072
+ aten::_efficient_attention_forward 0.32% 26.939us 0.96% 79.712us 26.571us 5.684ms 89.48% 5.684ms 1.895ms 3
4073
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 5.684ms 89.48% 5.684ms 1.895ms 3
4074
+ aten::contiguous 0.10% 8.370us 25.37% 2.117ms 235.225us 0.000us 0.00% 726.946us 80.772us 9
4075
+ aten::clone 0.27% 22.301us 25.27% 2.109ms 234.295us 0.000us 0.00% 726.946us 80.772us 9
4076
+ aten::copy_ 0.79% 65.502us 24.38% 2.034ms 226.048us 668.514us 10.52% 726.946us 80.772us 9
4077
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 668.514us 10.52% 668.514us 74.279us 9
4078
+ Activity Buffer Request 20.48% 1.709ms 20.48% 1.709ms 1.709ms 58.432us 0.92% 58.432us 58.432us 1
4079
+ aten::transpose 0.59% 49.601us 0.80% 67.072us 2.795us 0.000us 0.00% 0.000us 0.000us 24
4080
+ aten::as_strided 0.21% 17.471us 0.21% 17.471us 0.728us 0.000us 0.00% 0.000us 0.000us 24
4081
+ aten::empty_like 0.14% 11.518us 0.62% 51.920us 5.769us 0.000us 0.00% 0.000us 0.000us 9
4082
+ aten::empty 0.81% 67.173us 0.81% 67.173us 3.199us 0.000us 0.00% 0.000us 0.000us 21
4083
+ cudaLaunchKernel 3.36% 280.595us 3.36% 280.595us 23.383us 0.000us 0.00% 0.000us 0.000us 12
4084
+ cudaStreamIsCapturing 0.03% 2.391us 0.03% 2.391us 0.797us 0.000us 0.00% 0.000us 0.000us 3
4085
+ cudaFuncSetAttribute 0.03% 2.751us 0.03% 2.751us 0.917us 0.000us 0.00% 0.000us 0.000us 3
4086
+ cudaDeviceSynchronize 69.51% 5.799ms 69.51% 5.799ms 5.799ms 0.000us 0.00% 0.000us 0.000us 1
4087
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4088
+ Self CPU time total: 8.344ms
4089
+ Self CUDA time total: 6.353ms
4090
 
4091
 
4092
 
 
4096
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4097
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4098
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4099
+ torch_mem_eff 2.83% 247.966us 30.03% 2.630ms 2.630ms 0.000us 0.00% 6.745ms 6.745ms 1
4100
+ torch_mem_eff 0.00% 0.000us 0.00% 0.000us 0.000us 6.693ms 100.13% 6.693ms 6.693ms 1
4101
+ aten::scaled_dot_product_attention 0.19% 17.071us 1.57% 137.393us 45.798us 0.000us 0.00% 6.009ms 2.003ms 3
4102
+ aten::_scaled_dot_product_efficient_attention 0.21% 18.029us 1.37% 120.322us 40.107us 0.000us 0.00% 6.009ms 2.003ms 3
4103
+ aten::_efficient_attention_forward 0.30% 26.699us 0.92% 80.822us 26.941us 6.009ms 89.89% 6.009ms 2.003ms 3
4104
+ fmha_cutlassF_bf16_aligned_64x128_rf_sm80(PyTorchMem... 0.00% 0.000us 0.00% 0.000us 0.000us 6.009ms 89.89% 6.009ms 2.003ms 3
4105
+ aten::contiguous 0.09% 8.060us 25.13% 2.201ms 244.542us 0.000us 0.00% 736.293us 81.810us 9
4106
+ aten::clone 0.25% 21.768us 25.04% 2.193ms 243.646us 0.000us 0.00% 736.293us 81.810us 9
4107
+ aten::copy_ 0.76% 66.873us 24.16% 2.115ms 235.039us 675.652us 10.11% 736.293us 81.810us 9
4108
+ void at::native::elementwise_kernel&lt;128, 4, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 675.652us 10.11% 675.652us 75.072us 9
4109
+ Activity Buffer Request 20.46% 1.792ms 20.46% 1.792ms 1.792ms 60.641us 0.91% 60.641us 60.641us 1
4110
+ aten::transpose 0.56% 48.641us 0.74% 65.181us 2.716us 0.000us 0.00% 0.000us 0.000us 24
4111
+ aten::as_strided 0.19% 16.540us 0.19% 16.540us 0.689us 0.000us 0.00% 0.000us 0.000us 24
4112
+ aten::empty_like 0.14% 12.261us 0.64% 55.702us 6.189us 0.000us 0.00% 0.000us 0.000us 9
4113
+ aten::empty 0.78% 68.633us 0.78% 68.633us 3.268us 0.000us 0.00% 0.000us 0.000us 21
4114
+ cudaLaunchKernel 3.20% 280.067us 3.20% 280.067us 23.339us 0.000us 0.00% 0.000us 0.000us 12
4115
+ cudaStreamIsCapturing 0.03% 2.620us 0.03% 2.620us 0.873us 0.000us 0.00% 0.000us 0.000us 3
4116
+ cudaFuncSetAttribute 0.03% 2.860us 0.03% 2.860us 0.953us 0.000us 0.00% 0.000us 0.000us 3
4117
+ cudaDeviceSynchronize 69.97% 6.127ms 69.97% 6.127ms 6.127ms 0.000us 0.00% 0.000us 0.000us 1
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
+ Self CPU time total: 8.757ms
4120
+ Self CUDA time total: 6.684ms
4121
 
4122
 
4123
  impl wl p50(ms) ok
4124
+ torch_mem_eff cuda_attn_L128_bfloat16 1.86 True
4125
+ torch_mem_eff cuda_attn_L256_bfloat16 1.92 True
4126
+ torch_mem_eff cuda_attn_L320_bfloat16 2.02 True
4127
+ torch_mem_eff cuda_attn_L384_bfloat16 1.99 True
4128
+ torch_mem_eff cuda_attn_L448_bfloat16 2.10 True
4129
+ torch_mem_eff cuda_attn_L512_bfloat16 2.25 True
4130
  </pre></div>
 
 
 
 
 
 
4131
  <div class="cell-artifacts">
4132
  <h4>Artifacts:</h4>
4133
  <a href="artifacts/benchmark/attention.jsonl" class="artifact" target="_blank">attention.jsonl</a>
flash_attn/impls/sage_attention.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: benchmark | 4.72s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3938,24 +3938,22 @@ Cell: benchmark | 4.72s
3938
  <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
3939
  impl wl p50(ms) ok
3940
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
3941
- Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
3942
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
3943
- Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
3944
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
3945
- Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
3946
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
3947
- Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
3948
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
3949
- Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
3950
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
3951
- Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
3952
  </pre></div>
3953
  <div class="cell-stderr">
3954
- Fetching 8 files: 0%| | 0/8 [00:00&lt;?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
3955
-
3956
- Fetching 8 files: 12%|█▎ | 1/8 [00:00&lt;00:00, 7.67it/s]
3957
- Fetching 8 files: 38%|███▊ | 3/8 [00:00&lt;00:01, 3.86it/s]
3958
- Fetching 8 files: 100%|██████████| 8/8 [00:00&lt;00:00, 10.82it/s]
3959
  </div>
3960
  <div class="cell-artifacts">
3961
  <h4>Artifacts:</h4>
 
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: benchmark | 4.95s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3938
  <div class="cell-stdout"><pre class="stdout-text">Running attention benchmark on cuda with 6 workloads.
3939
  impl wl p50(ms) ok
3940
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
3941
+ Error: module &#x27;sage_attention_b91c5fb7ee1dcfba&#x27; has no attribute &#x27;fwd&#x27;
3942
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
3943
+ Error: module &#x27;sage_attention_b91c5fb7ee1dcfba&#x27; has no attribute &#x27;fwd&#x27;
3944
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
3945
+ Error: module &#x27;sage_attention_b91c5fb7ee1dcfba&#x27; has no attribute &#x27;fwd&#x27;
3946
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
3947
+ Error: module &#x27;sage_attention_b91c5fb7ee1dcfba&#x27; has no attribute &#x27;fwd&#x27;
3948
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
3949
+ Error: module &#x27;sage_attention_b91c5fb7ee1dcfba&#x27; has no attribute &#x27;fwd&#x27;
3950
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
3951
+ Error: module &#x27;sage_attention_b91c5fb7ee1dcfba&#x27; has no attribute &#x27;fwd&#x27;
3952
  </pre></div>
3953
  <div class="cell-stderr">
3954
+ Fetching 8 files: 0%| | 0/8 [00:00&lt;?, ?it/s]
3955
+ Fetching 8 files: 38%|███▊ | 3/8 [00:00&lt;00:01, 3.95it/s]
3956
+ Fetching 8 files: 100%|██████████| 8/8 [00:00&lt;00:00, 10.53it/s]
 
 
3957
  </div>
3958
  <div class="cell-artifacts">
3959
  <h4>Artifacts:</h4>
flash_attn/impls/xformers.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: benchmark | 5.49s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3940,21 +3940,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L128_bfloat16
3940
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3941
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3942
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3943
- xformers_meff 9.60% 449.460us 54.45% 2.550ms 2.550ms 0.000us 0.00% 3.540ms 3.540ms 1
3944
- xformers_flash3::flash_fwd 4.00% 187.356us 44.14% 2.067ms 689.137us 0.000us 0.00% 3.540ms 1.180ms 3
3945
- flash_attn_3::fwd 1.48% 69.234us 40.14% 1.880ms 626.685us 2.646ms 100.00% 3.540ms 1.180ms 3
3946
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.648ms 100.06% 2.648ms 2.648ms 1
3947
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.646ms 100.00% 2.646ms 882.010us 3
3948
- Activity Buffer Request 36.74% 1.721ms 36.74% 1.721ms 1.721ms 894.309us 33.80% 894.309us 894.309us 1
3949
- aten::empty 0.73% 34.410us 0.73% 34.410us 5.735us 0.000us 0.00% 0.000us 0.000us 6
3950
- cudaFuncSetAttribute 0.25% 11.780us 0.25% 11.780us 3.927us 0.000us 0.00% 0.000us 0.000us 3
3951
- cudaLaunchKernel 0.93% 43.670us 0.93% 43.670us 14.557us 0.000us 0.00% 0.000us 0.000us 3
3952
- aten::reshape 0.24% 11.301us 0.72% 33.571us 5.595us 0.000us 0.00% 0.000us 0.000us 6
3953
- aten::view 0.48% 22.270us 0.48% 22.270us 3.712us 0.000us 0.00% 0.000us 0.000us 6
3954
- cudaDeviceSynchronize 45.55% 2.133ms 45.55% 2.133ms 2.133ms 0.000us 0.00% 0.000us 0.000us 1
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
- Self CPU time total: 4.684ms
3957
- Self CUDA time total: 2.646ms
3958
 
3959
 
3960
 
@@ -3964,21 +3964,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L256_bfloat16
3964
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3965
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3966
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3967
- xformers_meff 6.53% 314.780us 50.80% 2.448ms 2.448ms 0.000us 0.00% 3.745ms 3.745ms 1
3968
- xformers_flash3::flash_fwd 2.99% 144.051us 43.78% 2.110ms 703.226us 0.000us 0.00% 3.745ms 1.248ms 3
3969
- flash_attn_3::fwd 1.06% 51.161us 40.79% 1.966ms 655.209us 2.793ms 100.00% 3.745ms 1.248ms 3
3970
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.795ms 100.06% 2.795ms 2.795ms 1
3971
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.793ms 100.00% 2.793ms 931.037us 3
3972
- Activity Buffer Request 38.27% 1.844ms 38.27% 1.844ms 1.844ms 952.158us 34.09% 952.158us 952.158us 1
3973
- aten::empty 0.59% 28.641us 0.59% 28.641us 4.774us 0.000us 0.00% 0.000us 0.000us 6
3974
- cudaFuncSetAttribute 0.11% 5.380us 0.11% 5.380us 1.793us 0.000us 0.00% 0.000us 0.000us 3
3975
- cudaLaunchKernel 0.75% 36.051us 0.75% 36.051us 12.017us 0.000us 0.00% 0.000us 0.000us 3
3976
- aten::reshape 0.19% 9.170us 0.49% 23.510us 3.918us 0.000us 0.00% 0.000us 0.000us 6
3977
- aten::view 0.30% 14.340us 0.30% 14.340us 2.390us 0.000us 0.00% 0.000us 0.000us 6
3978
- cudaDeviceSynchronize 49.20% 2.371ms 49.20% 2.371ms 2.371ms 0.000us 0.00% 0.000us 0.000us 1
3979
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3980
- Self CPU time total: 4.819ms
3981
- Self CUDA time total: 2.793ms
3982
 
3983
 
3984
 
@@ -3988,21 +3988,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L320_bfloat16
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3990
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3991
- xformers_meff 6.41% 306.378us 48.23% 2.306ms 2.306ms 0.000us 0.00% 3.879ms 3.879ms 1
3992
- xformers_flash3::flash_fwd 2.97% 141.954us 41.36% 1.977ms 659.046us 0.000us 0.00% 3.879ms 1.293ms 3
3993
- flash_attn_3::fwd 1.09% 51.910us 38.39% 1.835ms 611.728us 2.892ms 100.00% 3.879ms 1.293ms 3
3994
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.893ms 100.06% 2.893ms 2.893ms 1
3995
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.892ms 100.00% 2.892ms 963.882us 3
3996
- Activity Buffer Request 35.83% 1.713ms 35.83% 1.713ms 1.713ms 986.975us 34.13% 986.975us 986.975us 1
3997
- aten::empty 0.60% 28.840us 0.60% 28.840us 4.807us 0.000us 0.00% 0.000us 0.000us 6
3998
- cudaFuncSetAttribute 0.11% 5.330us 0.11% 5.330us 1.777us 0.000us 0.00% 0.000us 0.000us 3
3999
- cudaLaunchKernel 0.75% 36.082us 0.75% 36.082us 12.027us 0.000us 0.00% 0.000us 0.000us 3
4000
- aten::reshape 0.17% 8.059us 0.47% 22.400us 3.733us 0.000us 0.00% 0.000us 0.000us 6
4001
- aten::view 0.30% 14.341us 0.30% 14.341us 2.390us 0.000us 0.00% 0.000us 0.000us 6
4002
- cudaDeviceSynchronize 51.77% 2.475ms 51.77% 2.475ms 2.475ms 0.000us 0.00% 0.000us 0.000us 1
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
- Self CPU time total: 4.781ms
4005
- Self CUDA time total: 2.892ms
4006
 
4007
 
4008
 
@@ -4012,21 +4012,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L384_bfloat16
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4014
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4015
- xformers_meff 6.14% 305.279us 50.00% 2.487ms 2.487ms 0.000us 0.00% 3.889ms 3.889ms 1
4016
- xformers_flash3::flash_fwd 2.94% 146.052us 43.42% 2.159ms 719.674us 0.000us 0.00% 3.889ms 1.296ms 3
4017
- flash_attn_3::fwd 1.05% 52.012us 40.48% 2.013ms 670.990us 2.906ms 100.00% 3.889ms 1.296ms 3
4018
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.907ms 100.06% 2.907ms 2.907ms 1
4019
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.906ms 100.00% 2.906ms 968.605us 3
4020
- Activity Buffer Request 34.76% 1.728ms 34.76% 1.728ms 1.728ms 983.453us 33.84% 983.453us 983.453us 1
4021
- aten::empty 0.63% 31.322us 0.63% 31.322us 5.220us 0.000us 0.00% 0.000us 0.000us 6
4022
- cudaFuncSetAttribute 0.11% 5.389us 0.11% 5.389us 1.796us 0.000us 0.00% 0.000us 0.000us 3
4023
- cudaLaunchKernel 3.94% 195.844us 3.94% 195.844us 65.281us 0.000us 0.00% 0.000us 0.000us 3
4024
- aten::reshape 0.17% 8.560us 0.45% 22.331us 3.722us 0.000us 0.00% 0.000us 0.000us 6
4025
- aten::view 0.28% 13.771us 0.28% 13.771us 2.295us 0.000us 0.00% 0.000us 0.000us 6
4026
- cudaDeviceSynchronize 50.00% 2.486ms 50.00% 2.486ms 2.486ms 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
- Self CPU time total: 4.973ms
4029
- Self CUDA time total: 2.906ms
4030
 
4031
 
4032
 
@@ -4036,21 +4036,21 @@ PROFILE TRACE: xformers_meff | cuda_attn_L448_bfloat16
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
- xformers_meff 5.54% 306.968us 45.05% 2.496ms 2.496ms 0.000us 0.00% 4.618ms 4.618ms 1
4040
- xformers_flash3::flash_fwd 2.62% 145.024us 39.11% 2.167ms 722.434us 0.000us 0.00% 4.618ms 1.539ms 3
4041
- flash_attn_3::fwd 0.92% 51.181us 36.50% 2.022ms 674.093us 3.463ms 100.00% 4.618ms 1.539ms 3
4042
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.465ms 100.05% 3.465ms 3.465ms 1
4043
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.463ms 100.00% 3.463ms 1.154ms 3
4044
- Activity Buffer Request 31.42% 1.741ms 31.42% 1.741ms 1.741ms 1.155ms 33.34% 1.155ms 1.155ms 1
4045
- aten::empty 0.54% 29.990us 0.54% 29.990us 4.998us 0.000us 0.00% 0.000us 0.000us 6
4046
- cudaFuncSetAttribute 0.10% 5.350us 0.10% 5.350us 1.783us 0.000us 0.00% 0.000us 0.000us 3
4047
- cudaLaunchKernel 3.51% 194.715us 3.51% 194.715us 64.905us 0.000us 0.00% 0.000us 0.000us 3
4048
- aten::reshape 0.15% 8.420us 0.40% 22.040us 3.673us 0.000us 0.00% 0.000us 0.000us 6
4049
- aten::view 0.25% 13.620us 0.25% 13.620us 2.270us 0.000us 0.00% 0.000us 0.000us 6
4050
- cudaDeviceSynchronize 54.95% 3.045ms 54.95% 3.045ms 3.045ms 0.000us 0.00% 0.000us 0.000us 1
4051
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4052
- Self CPU time total: 5.541ms
4053
- Self CUDA time total: 3.463ms
4054
 
4055
 
4056
 
@@ -4060,37 +4060,37 @@ PROFILE TRACE: xformers_meff | cuda_attn_L512_bfloat16
4060
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4061
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4062
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4063
- xformers_meff 5.16% 304.966us 48.93% 2.893ms 2.893ms 0.000us 0.00% 4.598ms 4.598ms 1
4064
- xformers_flash3::flash_fwd 9.37% 553.844us 43.37% 2.564ms 854.584us 0.000us 0.00% 4.598ms 1.533ms 3
4065
- flash_attn_3::fwd 0.88% 52.300us 34.00% 2.010ms 669.970us 3.443ms 100.00% 4.598ms 1.533ms 3
4066
- xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.445ms 100.05% 3.445ms 3.445ms 1
4067
- void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.443ms 100.00% 3.443ms 1.148ms 3
4068
- Activity Buffer Request 28.71% 1.697ms 28.71% 1.697ms 1.697ms 1.155ms 33.53% 1.155ms 1.155ms 1
4069
- aten::empty 0.52% 30.653us 0.52% 30.653us 5.109us 0.000us 0.00% 0.000us 0.000us 6
4070
- cudaFuncSetAttribute 0.09% 5.400us 0.09% 5.400us 1.800us 0.000us 0.00% 0.000us 0.000us 3
4071
- cudaLaunchKernel 3.80% 224.365us 3.80% 224.365us 74.788us 0.000us 0.00% 0.000us 0.000us 3
4072
- aten::reshape 0.15% 8.918us 0.40% 23.921us 3.987us 0.000us 0.00% 0.000us 0.000us 6
4073
- aten::view 0.25% 15.003us 0.25% 15.003us 2.501us 0.000us 0.00% 0.000us 0.000us 6
4074
- cudaDeviceSynchronize 51.07% 3.019ms 51.07% 3.019ms 3.019ms 0.000us 0.00% 0.000us 0.000us 1
4075
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
- Self CPU time total: 5.912ms
4077
- Self CUDA time total: 3.443ms
4078
 
4079
 
4080
  impl wl p50(ms) ok
4081
- xformers_meff cuda_attn_L128_bfloat16 0.98 True
4082
  xformers_meff cuda_attn_L256_bfloat16 1.04 True
4083
- xformers_meff cuda_attn_L320_bfloat16 1.06 True
4084
- xformers_meff cuda_attn_L384_bfloat16 1.09 True
4085
  xformers_meff cuda_attn_L448_bfloat16 1.26 True
4086
- xformers_meff cuda_attn_L512_bfloat16 1.24 True
4087
  </pre></div>
4088
  <div class="uv-install-logs" id="uv-logs-benchmark">
4089
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4090
  <div class="uv-logs-content" style="display: none;">
4091
  Downloading xformers (111.8MiB)
4092
  Downloaded xformers
4093
- Installed 1 package in 12ms
4094
  </div>
4095
  </div>
4096
  <div class="cell-artifacts">
 
3888
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: benchmark | 5.67s
3892
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3894
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3940
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3941
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3942
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3943
+ xformers_meff 9.78% 468.612us 53.77% 2.576ms 2.576ms 0.000us 0.00% 3.664ms 3.664ms 1
3944
+ xformers_flash3::flash_fwd 4.05% 193.923us 43.19% 2.069ms 689.708us 0.000us 0.00% 3.664ms 1.221ms 3
3945
+ flash_attn_3::fwd 1.52% 72.582us 39.15% 1.875ms 625.067us 2.752ms 100.00% 3.664ms 1.221ms 3
3946
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.754ms 100.05% 2.754ms 2.754ms 1
3947
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.752ms 100.00% 2.752ms 917.464us 3
3948
+ Activity Buffer Request 35.57% 1.704ms 35.57% 1.704ms 1.704ms 911.394us 33.11% 911.394us 911.394us 1
3949
+ aten::empty 0.91% 43.821us 0.91% 43.821us 7.304us 0.000us 0.00% 0.000us 0.000us 6
3950
+ cudaFuncSetAttribute 0.25% 12.121us 0.25% 12.121us 4.040us 0.000us 0.00% 0.000us 0.000us 3
3951
+ cudaLaunchKernel 0.89% 42.701us 0.89% 42.701us 14.234us 0.000us 0.00% 0.000us 0.000us 3
3952
+ aten::reshape 0.31% 15.029us 0.79% 38.050us 6.342us 0.000us 0.00% 0.000us 0.000us 6
3953
+ aten::view 0.48% 23.021us 0.48% 23.021us 3.837us 0.000us 0.00% 0.000us 0.000us 6
3954
+ cudaDeviceSynchronize 46.23% 2.215ms 46.23% 2.215ms 2.215ms 0.000us 0.00% 0.000us 0.000us 1
3955
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3956
+ Self CPU time total: 4.790ms
3957
+ Self CUDA time total: 2.752ms
3958
 
3959
 
3960
 
 
3964
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3965
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3966
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3967
+ xformers_meff 6.55% 315.485us 49.52% 2.386ms 2.386ms 0.000us 0.00% 3.791ms 3.791ms 1
3968
+ xformers_flash3::flash_fwd 2.94% 141.873us 42.50% 2.048ms 682.535us 0.000us 0.00% 3.791ms 1.264ms 3
3969
+ flash_attn_3::fwd 1.10% 52.803us 39.56% 1.906ms 635.244us 2.857ms 100.00% 3.791ms 1.264ms 3
3970
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.858ms 100.05% 2.858ms 2.858ms 1
3971
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.857ms 100.00% 2.857ms 952.327us 3
3972
+ Activity Buffer Request 37.05% 1.785ms 37.05% 1.785ms 1.785ms 933.660us 32.68% 933.660us 933.660us 1
3973
+ aten::empty 0.60% 29.019us 0.60% 29.019us 4.837us 0.000us 0.00% 0.000us 0.000us 6
3974
+ cudaFuncSetAttribute 0.12% 5.710us 0.12% 5.710us 1.903us 0.000us 0.00% 0.000us 0.000us 3
3975
+ cudaLaunchKernel 0.69% 33.350us 0.69% 33.350us 11.117us 0.000us 0.00% 0.000us 0.000us 3
3976
+ aten::reshape 0.18% 8.801us 0.47% 22.752us 3.792us 0.000us 0.00% 0.000us 0.000us 6
3977
+ aten::view 0.29% 13.951us 0.29% 13.951us 2.325us 0.000us 0.00% 0.000us 0.000us 6
3978
+ cudaDeviceSynchronize 50.48% 2.432ms 50.48% 2.432ms 2.432ms 0.000us 0.00% 0.000us 0.000us 1
3979
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3980
+ Self CPU time total: 4.818ms
3981
+ Self CUDA time total: 2.857ms
3982
 
3983
 
3984
 
 
3988
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3989
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3990
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3991
+ xformers_meff 6.44% 303.576us 47.74% 2.252ms 2.252ms 0.000us 0.00% 3.845ms 3.845ms 1
3992
+ xformers_flash3::flash_fwd 3.02% 142.344us 40.83% 1.926ms 641.984us 0.000us 0.00% 3.845ms 1.282ms 3
3993
+ flash_attn_3::fwd 1.11% 52.511us 37.81% 1.784ms 594.536us 2.878ms 100.00% 3.845ms 1.282ms 3
3994
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.880ms 100.05% 2.880ms 2.880ms 1
3995
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.878ms 100.00% 2.878ms 959.487us 3
3996
+ Activity Buffer Request 35.25% 1.663ms 35.25% 1.663ms 1.663ms 967.007us 33.59% 967.007us 967.007us 1
3997
+ aten::empty 0.62% 29.170us 0.62% 29.170us 4.862us 0.000us 0.00% 0.000us 0.000us 6
3998
+ cudaFuncSetAttribute 0.11% 5.320us 0.11% 5.320us 1.773us 0.000us 0.00% 0.000us 0.000us 3
3999
+ cudaLaunchKernel 0.72% 33.781us 0.72% 33.781us 11.260us 0.000us 0.00% 0.000us 0.000us 3
4000
+ aten::reshape 0.18% 8.350us 0.47% 21.990us 3.665us 0.000us 0.00% 0.000us 0.000us 6
4001
+ aten::view 0.29% 13.640us 0.29% 13.640us 2.273us 0.000us 0.00% 0.000us 0.000us 6
4002
+ cudaDeviceSynchronize 52.26% 2.465ms 52.26% 2.465ms 2.465ms 0.000us 0.00% 0.000us 0.000us 1
4003
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4004
+ Self CPU time total: 4.717ms
4005
+ Self CUDA time total: 2.878ms
4006
 
4007
 
4008
 
 
4012
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4013
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4014
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4015
+ xformers_meff 6.01% 303.306us 50.06% 2.525ms 2.525ms 0.000us 0.00% 3.923ms 3.923ms 1
4016
+ xformers_flash3::flash_fwd 2.90% 146.364us 43.59% 2.199ms 733.113us 0.000us 0.00% 3.923ms 1.308ms 3
4017
+ flash_attn_3::fwd 1.02% 51.431us 40.69% 2.053ms 684.325us 2.938ms 100.00% 3.923ms 1.308ms 3
4018
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 2.939ms 100.05% 2.939ms 2.939ms 1
4019
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 2.938ms 100.00% 2.938ms 979.195us 3
4020
+ Activity Buffer Request 34.86% 1.758ms 34.86% 1.758ms 1.758ms 985.691us 33.55% 985.691us 985.691us 1
4021
+ aten::empty 0.57% 28.860us 0.57% 28.860us 4.810us 0.000us 0.00% 0.000us 0.000us 6
4022
+ cudaFuncSetAttribute 0.11% 5.561us 0.11% 5.561us 1.854us 0.000us 0.00% 0.000us 0.000us 3
4023
+ cudaLaunchKernel 4.14% 208.674us 4.14% 208.674us 69.558us 0.000us 0.00% 0.000us 0.000us 3
4024
+ aten::reshape 0.18% 9.230us 0.45% 22.800us 3.800us 0.000us 0.00% 0.000us 0.000us 6
4025
+ aten::view 0.27% 13.570us 0.27% 13.570us 2.262us 0.000us 0.00% 0.000us 0.000us 6
4026
+ cudaDeviceSynchronize 49.94% 2.520ms 49.94% 2.520ms 2.520ms 0.000us 0.00% 0.000us 0.000us 1
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
+ Self CPU time total: 5.045ms
4029
+ Self CUDA time total: 2.938ms
4030
 
4031
 
4032
 
 
4036
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4037
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4038
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4039
+ xformers_meff 5.53% 307.446us 44.37% 2.468ms 2.468ms 0.000us 0.00% 4.694ms 4.694ms 1
4040
+ xformers_flash3::flash_fwd 2.65% 147.575us 38.45% 2.139ms 712.966us 0.000us 0.00% 4.694ms 1.565ms 3
4041
+ flash_attn_3::fwd 0.89% 49.519us 35.79% 1.991ms 663.774us 3.515ms 100.00% 4.694ms 1.565ms 3
4042
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.517ms 100.05% 3.517ms 3.517ms 1
4043
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.515ms 100.00% 3.515ms 1.172ms 3
4044
+ Activity Buffer Request 30.66% 1.706ms 30.66% 1.706ms 1.706ms 1.179ms 33.55% 1.179ms 1.179ms 1
4045
+ aten::empty 0.52% 28.861us 0.52% 28.861us 4.810us 0.000us 0.00% 0.000us 0.000us 6
4046
+ cudaFuncSetAttribute 0.11% 6.000us 0.11% 6.000us 2.000us 0.000us 0.00% 0.000us 0.000us 3
4047
+ cudaLaunchKernel 3.61% 201.015us 3.61% 201.015us 67.005us 0.000us 0.00% 0.000us 0.000us 3
4048
+ aten::reshape 0.15% 8.290us 0.39% 21.930us 3.655us 0.000us 0.00% 0.000us 0.000us 6
4049
+ aten::view 0.25% 13.640us 0.25% 13.640us 2.273us 0.000us 0.00% 0.000us 0.000us 6
4050
+ cudaDeviceSynchronize 55.63% 3.095ms 55.63% 3.095ms 3.095ms 0.000us 0.00% 0.000us 0.000us 1
4051
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4052
+ Self CPU time total: 5.563ms
4053
+ Self CUDA time total: 3.515ms
4054
 
4055
 
4056
 
 
4060
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4061
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4062
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4063
+ xformers_meff 5.46% 305.147us 45.13% 2.521ms 2.521ms 0.000us 0.00% 4.658ms 4.658ms 1
4064
+ xformers_flash3::flash_fwd 2.65% 147.824us 39.28% 2.194ms 731.306us 0.000us 0.00% 4.658ms 1.553ms 3
4065
+ flash_attn_3::fwd 0.94% 52.350us 36.63% 2.046ms 682.031us 3.488ms 100.00% 4.658ms 1.553ms 3
4066
+ xformers_meff 0.00% 0.000us 0.00% 0.000us 0.000us 3.489ms 100.05% 3.489ms 3.489ms 1
4067
+ void cutlass::device_kernel&lt;flash::enable_sm80_to_sm... 0.00% 0.000us 0.00% 0.000us 0.000us 3.488ms 100.00% 3.488ms 1.163ms 3
4068
+ Activity Buffer Request 31.45% 1.757ms 31.45% 1.757ms 1.757ms 1.171ms 33.57% 1.171ms 1.171ms 1
4069
+ aten::empty 0.54% 29.960us 0.54% 29.960us 4.993us 0.000us 0.00% 0.000us 0.000us 6
4070
+ cudaFuncSetAttribute 0.10% 5.370us 0.10% 5.370us 1.790us 0.000us 0.00% 0.000us 0.000us 3
4071
+ cudaLaunchKernel 3.61% 201.885us 3.61% 201.885us 67.295us 0.000us 0.00% 0.000us 0.000us 3
4072
+ aten::reshape 0.15% 8.170us 0.39% 21.900us 3.650us 0.000us 0.00% 0.000us 0.000us 6
4073
+ aten::view 0.25% 13.730us 0.25% 13.730us 2.288us 0.000us 0.00% 0.000us 0.000us 6
4074
+ cudaDeviceSynchronize 54.87% 3.065ms 54.87% 3.065ms 3.065ms 0.000us 0.00% 0.000us 0.000us 1
4075
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4076
+ Self CPU time total: 5.586ms
4077
+ Self CUDA time total: 3.488ms
4078
 
4079
 
4080
  impl wl p50(ms) ok
4081
+ xformers_meff cuda_attn_L128_bfloat16 0.99 True
4082
  xformers_meff cuda_attn_L256_bfloat16 1.04 True
4083
+ xformers_meff cuda_attn_L320_bfloat16 1.07 True
4084
+ xformers_meff cuda_attn_L384_bfloat16 1.08 True
4085
  xformers_meff cuda_attn_L448_bfloat16 1.26 True
4086
+ xformers_meff cuda_attn_L512_bfloat16 1.25 True
4087
  </pre></div>
4088
  <div class="uv-install-logs" id="uv-logs-benchmark">
4089
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4090
  <div class="uv-logs-content" style="display: none;">
4091
  Downloading xformers (111.8MiB)
4092
  Downloaded xformers
4093
+ Installed 1 package in 11ms
4094
  </div>
4095
  </div>
4096
  <div class="cell-artifacts">
flash_attn/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 0f160f3f11d41b3a388cb9ab3a3ed23dc9ca473cb8531e7d3dc53c94cc97ebd0
  • Pointer size: 130 Bytes
  • Size of remote file: 24.8 kB

Git LFS Details

  • SHA256: 4c6091c233ea3ade4488d4a47b74639c4507900bcc055a5d5ec3d4f9d3262f2b
  • Pointer size: 130 Bytes
  • Size of remote file: 24.8 kB
flash_attn/results/combined_results.html CHANGED
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
- <dc:date>2025-12-19T19:55:48.469348</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
@@ -3999,96 +3999,96 @@ body[data-tool="eraser"] .main-content {
3999
  <g id="matplotlib.axis_2">
4000
  <g id="ytick_1">
4001
  <g id="grid-y--2" class="grid grid-y">
4002
- <path d="M 47.81 402.388331 L 835.361742 402.388331 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4003
  </g>
4004
  <g id="line2d_7">
4005
  <defs>
4006
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4007
  </defs>
4008
  <g>
4009
- <use ns4:href="#m0fca2865ba" x="47.81" y="402.388331" style="stroke: #000000; stroke-width: 0.8" />
4010
  </g>
4011
  </g>
4012
  <g id="text_7">
4013
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="406.18755" transform="rotate(-0 40.81 406.18755)">1.0</text>
4014
  </g>
4015
  </g>
4016
  <g id="ytick_2">
4017
  <g id="grid-y--3" class="grid grid-y">
4018
- <path d="M 47.81 341.392024 L 835.361742 341.392024 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4019
  </g>
4020
  <g id="line2d_8">
4021
  <g>
4022
- <use ns4:href="#m0fca2865ba" x="47.81" y="341.392024" style="stroke: #000000; stroke-width: 0.8" />
4023
  </g>
4024
  </g>
4025
  <g id="text_8">
4026
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="345.191243" transform="rotate(-0 40.81 345.191243)">1.2</text>
4027
  </g>
4028
  </g>
4029
  <g id="ytick_3">
4030
  <g id="grid-y--4" class="grid grid-y">
4031
- <path d="M 47.81 280.395718 L 835.361742 280.395718 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4032
  </g>
4033
  <g id="line2d_9">
4034
  <g>
4035
- <use ns4:href="#m0fca2865ba" x="47.81" y="280.395718" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="284.194936" transform="rotate(-0 40.81 284.194936)">1.4</text>
4040
  </g>
4041
  </g>
4042
  <g id="ytick_4">
4043
  <g id="grid-y--5" class="grid grid-y">
4044
- <path d="M 47.81 219.399411 L 835.361742 219.399411 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4045
  </g>
4046
  <g id="line2d_10">
4047
  <g>
4048
- <use ns4:href="#m0fca2865ba" x="47.81" y="219.399411" style="stroke: #000000; stroke-width: 0.8" />
4049
  </g>
4050
  </g>
4051
  <g id="text_10">
4052
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="223.198629" transform="rotate(-0 40.81 223.198629)">1.6</text>
4053
  </g>
4054
  </g>
4055
  <g id="ytick_5">
4056
  <g id="grid-y--6" class="grid grid-y">
4057
- <path d="M 47.81 158.403104 L 835.361742 158.403104 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4058
  </g>
4059
  <g id="line2d_11">
4060
  <g>
4061
- <use ns4:href="#m0fca2865ba" x="47.81" y="158.403104" style="stroke: #000000; stroke-width: 0.8" />
4062
  </g>
4063
  </g>
4064
  <g id="text_11">
4065
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="162.202323" transform="rotate(-0 40.81 162.202323)">1.8</text>
4066
  </g>
4067
  </g>
4068
  <g id="ytick_6">
4069
  <g id="grid-y--7" class="grid grid-y">
4070
- <path d="M 47.81 97.406797 L 835.361742 97.406797 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4071
  </g>
4072
  <g id="line2d_12">
4073
  <g>
4074
- <use ns4:href="#m0fca2865ba" x="47.81" y="97.406797" style="stroke: #000000; stroke-width: 0.8" />
4075
  </g>
4076
  </g>
4077
  <g id="text_12">
4078
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="101.206016" transform="rotate(-0 40.81 101.206016)">2.0</text>
4079
  </g>
4080
  </g>
4081
  <g id="ytick_7">
4082
  <g id="grid-y--8" class="grid grid-y">
4083
- <path d="M 47.81 36.41049 L 835.361742 36.41049 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4084
  </g>
4085
  <g id="line2d_13">
4086
  <g>
4087
- <use ns4:href="#m0fca2865ba" x="47.81" y="36.41049" style="stroke: #000000; stroke-width: 0.8" />
4088
  </g>
4089
  </g>
4090
  <g id="text_13">
4091
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="40.209709" transform="rotate(-0 40.81 40.209709)">2.2</text>
4092
  </g>
4093
  </g>
4094
  <g id="label--y" class="ylabel">
@@ -4096,73 +4096,73 @@ body[data-tool="eraser"] .main-content {
4096
  </g>
4097
  </g>
4098
  <g id="series--torch-flash-ma" class="series">
4099
- <path d="M 83.607806 337.885652 L 226.799032 325.671141 L 369.990258 317.408887 L 513.181484 307.237447 L 656.37271 265.438813 L 799.563935 253.93491 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4100
  <defs>
4101
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4102
  </defs>
4103
  <g clip-path="url(#p09feef2583)">
4104
- <use ns4:href="#md7efaf3aec" x="83.607806" y="337.885652" style="fill: #1f77b4; stroke: #1f77b4" />
4105
- <use ns4:href="#md7efaf3aec" x="226.799032" y="325.671141" style="fill: #1f77b4; stroke: #1f77b4" />
4106
- <use ns4:href="#md7efaf3aec" x="369.990258" y="317.408887" style="fill: #1f77b4; stroke: #1f77b4" />
4107
- <use ns4:href="#md7efaf3aec" x="513.181484" y="307.237447" style="fill: #1f77b4; stroke: #1f77b4" />
4108
- <use ns4:href="#md7efaf3aec" x="656.37271" y="265.438813" style="fill: #1f77b4; stroke: #1f77b4" />
4109
- <use ns4:href="#md7efaf3aec" x="799.563935" y="253.93491" style="fill: #1f77b4; stroke: #1f77b4" />
4110
  </g>
4111
  </g>
4112
  <g id="series--torch-mem-eff" class="series">
4113
- <path d="M 83.607806 150.723364 L 226.799032 118.266314 L 369.990258 111.819309 L 513.181484 85.541185 L 656.37271 73.225726 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4114
  <defs>
4115
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4116
  </defs>
4117
  <g clip-path="url(#p09feef2583)">
4118
- <use ns4:href="#m9b8c54d372" x="83.607806" y="150.723364" style="fill: #ff7f0e; stroke: #ff7f0e" />
4119
- <use ns4:href="#m9b8c54d372" x="226.799032" y="118.266314" style="fill: #ff7f0e; stroke: #ff7f0e" />
4120
- <use ns4:href="#m9b8c54d372" x="369.990258" y="111.819309" style="fill: #ff7f0e; stroke: #ff7f0e" />
4121
- <use ns4:href="#m9b8c54d372" x="513.181484" y="85.541185" style="fill: #ff7f0e; stroke: #ff7f0e" />
4122
- <use ns4:href="#m9b8c54d372" x="656.37271" y="73.225726" style="fill: #ff7f0e; stroke: #ff7f0e" />
4123
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4124
  </g>
4125
  </g>
4126
  <g id="series--xformers-meff" class="series">
4127
- <path d="M 83.607806 408.996061 L 226.799032 390.330886 L 369.990258 383.840574 L 513.181484 376.133386 L 656.37271 322.193132 L 799.563935 327.713603 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4128
  <defs>
4129
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4130
  </defs>
4131
  <g clip-path="url(#p09feef2583)">
4132
- <use ns4:href="#mc655281e0b" x="83.607806" y="408.996061" style="fill: #2ca02c; stroke: #2ca02c" />
4133
- <use ns4:href="#mc655281e0b" x="226.799032" y="390.330886" style="fill: #2ca02c; stroke: #2ca02c" />
4134
- <use ns4:href="#mc655281e0b" x="369.990258" y="383.840574" style="fill: #2ca02c; stroke: #2ca02c" />
4135
- <use ns4:href="#mc655281e0b" x="513.181484" y="376.133386" style="fill: #2ca02c; stroke: #2ca02c" />
4136
- <use ns4:href="#mc655281e0b" x="656.37271" y="322.193132" style="fill: #2ca02c; stroke: #2ca02c" />
4137
- <use ns4:href="#mc655281e0b" x="799.563935" y="327.713603" style="fill: #2ca02c; stroke: #2ca02c" />
4138
  </g>
4139
  </g>
4140
  <g id="series--hf-kernels-flash-attn" class="series">
4141
- <path d="M 83.607806 417.466618 L 226.799032 402.775353 L 369.990258 391.490731 L 513.181484 383.472462 L 656.37271 336.492496 L 799.563935 330.032377 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4142
  <defs>
4143
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4144
  </defs>
4145
  <g clip-path="url(#p09feef2583)">
4146
- <use ns4:href="#m61c8040d7e" x="83.607806" y="417.466618" style="fill: #d62728; stroke: #d62728" />
4147
- <use ns4:href="#m61c8040d7e" x="226.799032" y="402.775353" style="fill: #d62728; stroke: #d62728" />
4148
- <use ns4:href="#m61c8040d7e" x="369.990258" y="391.490731" style="fill: #d62728; stroke: #d62728" />
4149
- <use ns4:href="#m61c8040d7e" x="513.181484" y="383.472462" style="fill: #d62728; stroke: #d62728" />
4150
- <use ns4:href="#m61c8040d7e" x="656.37271" y="336.492496" style="fill: #d62728; stroke: #d62728" />
4151
- <use ns4:href="#m61c8040d7e" x="799.563935" y="330.032377" style="fill: #d62728; stroke: #d62728" />
4152
  </g>
4153
  </g>
4154
  <g id="series--hf-kernels-flash-attn3" class="series">
4155
- <path d="M 83.607806 428.387702 L 226.799032 409.68288 L 369.990258 400.234552 L 513.181484 400.520929 L 656.37271 346.614528 L 799.563935 349.227915 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4156
  <defs>
4157
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4158
  </defs>
4159
  <g clip-path="url(#p09feef2583)">
4160
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4161
- <use ns4:href="#m7cd35be9cc" x="226.799032" y="409.68288" style="fill: #9467bd; stroke: #9467bd" />
4162
- <use ns4:href="#m7cd35be9cc" x="369.990258" y="400.234552" style="fill: #9467bd; stroke: #9467bd" />
4163
- <use ns4:href="#m7cd35be9cc" x="513.181484" y="400.520929" style="fill: #9467bd; stroke: #9467bd" />
4164
- <use ns4:href="#m7cd35be9cc" x="656.37271" y="346.614528" style="fill: #9467bd; stroke: #9467bd" />
4165
- <use ns4:href="#m7cd35be9cc" x="799.563935" y="349.227915" style="fill: #9467bd; stroke: #9467bd" />
4166
  </g>
4167
  </g>
4168
  <g id="patch_3">
@@ -4247,7 +4247,7 @@ body[data-tool="eraser"] .main-content {
4247
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4248
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4249
  </span> |
4250
- Cell: combine | 4.68s
4251
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4252
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4253
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4356,48 +4356,48 @@ Summary: 6 found, 0 skipped, 0 missing
4356
  COMBINED BENCHMARK SUMMARY
4357
 
4358
  impl wl p50(ms) ok
4359
- hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.95 True
4360
- hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.00 True
4361
- hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.04 True
4362
- hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.06 True
4363
- hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.22 True
4364
- hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.24 True
4365
- hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.91 True
4366
  hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True
4367
  hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True
4368
- hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.01 True
4369
- hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.18 True
4370
- hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.17 True
4371
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
4372
- Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
4373
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
4374
- Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
4375
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
4376
- Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
4377
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
4378
- Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
4379
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
4380
- Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
4381
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
4382
- Error: module &#x27;sage_attention_d202bc414c936d8&#x27; has no attribute &#x27;fwd&#x27;
4383
  torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
4384
- torch_flash_ma cuda_attn_L256_bfloat16 1.25 True
4385
- torch_flash_ma cuda_attn_L320_bfloat16 1.28 True
4386
- torch_flash_ma cuda_attn_L384_bfloat16 1.31 True
4387
- torch_flash_ma cuda_attn_L448_bfloat16 1.45 True
4388
- torch_flash_ma cuda_attn_L512_bfloat16 1.49 True
4389
- torch_mem_eff cuda_attn_L128_bfloat16 1.83 True
4390
- torch_mem_eff cuda_attn_L256_bfloat16 1.93 True
4391
- torch_mem_eff cuda_attn_L320_bfloat16 1.95 True
4392
- torch_mem_eff cuda_attn_L384_bfloat16 2.04 True
4393
- torch_mem_eff cuda_attn_L448_bfloat16 2.08 True
4394
- torch_mem_eff cuda_attn_L512_bfloat16 2.17 True
4395
- xformers_meff cuda_attn_L128_bfloat16 0.98 True
4396
  xformers_meff cuda_attn_L256_bfloat16 1.04 True
4397
- xformers_meff cuda_attn_L320_bfloat16 1.06 True
4398
- xformers_meff cuda_attn_L384_bfloat16 1.09 True
4399
  xformers_meff cuda_attn_L448_bfloat16 1.26 True
4400
- xformers_meff cuda_attn_L512_bfloat16 1.24 True
4401
 
4402
  GENERATING COMBINED VISUALIZATION
4403
 
@@ -4421,7 +4421,7 @@ Implementations included:
4421
  <div class="uv-install-logs" id="uv-logs-combine">
4422
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4423
  <div class="uv-logs-content" style="display: none;">
4424
- Installed 37 packages in 206ms
4425
  </div>
4426
  </div>
4427
  <div class="cell-artifacts">
@@ -4434,7 +4434,7 @@ Installed 37 packages in 206ms
4434
  <rdf:RDF>
4435
  <ns2:Work>
4436
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4437
- <dc:date>2025-12-19T19:55:48.469348</dc:date>
4438
  <dc:format>image/svg+xml</dc:format>
4439
  <dc:creator>
4440
  <ns2:Agent>
@@ -4544,96 +4544,96 @@ Installed 37 packages in 206ms
4544
  <g id="matplotlib.axis_2">
4545
  <g id="ytick_1">
4546
  <g id="grid-y--2" class="grid grid-y">
4547
- <path d="M 47.81 402.388331 L 835.361742 402.388331 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4548
  </g>
4549
  <g id="line2d_7">
4550
  <defs>
4551
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4552
  </defs>
4553
  <g>
4554
- <use ns4:href="#m0fca2865ba" x="47.81" y="402.388331" style="stroke: #000000; stroke-width: 0.8" />
4555
  </g>
4556
  </g>
4557
  <g id="text_7">
4558
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="406.18755" transform="rotate(-0 40.81 406.18755)">1.0</text>
4559
  </g>
4560
  </g>
4561
  <g id="ytick_2">
4562
  <g id="grid-y--3" class="grid grid-y">
4563
- <path d="M 47.81 341.392024 L 835.361742 341.392024 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4564
  </g>
4565
  <g id="line2d_8">
4566
  <g>
4567
- <use ns4:href="#m0fca2865ba" x="47.81" y="341.392024" style="stroke: #000000; stroke-width: 0.8" />
4568
  </g>
4569
  </g>
4570
  <g id="text_8">
4571
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="345.191243" transform="rotate(-0 40.81 345.191243)">1.2</text>
4572
  </g>
4573
  </g>
4574
  <g id="ytick_3">
4575
  <g id="grid-y--4" class="grid grid-y">
4576
- <path d="M 47.81 280.395718 L 835.361742 280.395718 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4577
  </g>
4578
  <g id="line2d_9">
4579
  <g>
4580
- <use ns4:href="#m0fca2865ba" x="47.81" y="280.395718" style="stroke: #000000; stroke-width: 0.8" />
4581
  </g>
4582
  </g>
4583
  <g id="text_9">
4584
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="284.194936" transform="rotate(-0 40.81 284.194936)">1.4</text>
4585
  </g>
4586
  </g>
4587
  <g id="ytick_4">
4588
  <g id="grid-y--5" class="grid grid-y">
4589
- <path d="M 47.81 219.399411 L 835.361742 219.399411 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4590
  </g>
4591
  <g id="line2d_10">
4592
  <g>
4593
- <use ns4:href="#m0fca2865ba" x="47.81" y="219.399411" style="stroke: #000000; stroke-width: 0.8" />
4594
  </g>
4595
  </g>
4596
  <g id="text_10">
4597
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="223.198629" transform="rotate(-0 40.81 223.198629)">1.6</text>
4598
  </g>
4599
  </g>
4600
  <g id="ytick_5">
4601
  <g id="grid-y--6" class="grid grid-y">
4602
- <path d="M 47.81 158.403104 L 835.361742 158.403104 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4603
  </g>
4604
  <g id="line2d_11">
4605
  <g>
4606
- <use ns4:href="#m0fca2865ba" x="47.81" y="158.403104" style="stroke: #000000; stroke-width: 0.8" />
4607
  </g>
4608
  </g>
4609
  <g id="text_11">
4610
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="162.202323" transform="rotate(-0 40.81 162.202323)">1.8</text>
4611
  </g>
4612
  </g>
4613
  <g id="ytick_6">
4614
  <g id="grid-y--7" class="grid grid-y">
4615
- <path d="M 47.81 97.406797 L 835.361742 97.406797 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4616
  </g>
4617
  <g id="line2d_12">
4618
  <g>
4619
- <use ns4:href="#m0fca2865ba" x="47.81" y="97.406797" style="stroke: #000000; stroke-width: 0.8" />
4620
  </g>
4621
  </g>
4622
  <g id="text_12">
4623
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="101.206016" transform="rotate(-0 40.81 101.206016)">2.0</text>
4624
  </g>
4625
  </g>
4626
  <g id="ytick_7">
4627
  <g id="grid-y--8" class="grid grid-y">
4628
- <path d="M 47.81 36.41049 L 835.361742 36.41049 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4629
  </g>
4630
  <g id="line2d_13">
4631
  <g>
4632
- <use ns4:href="#m0fca2865ba" x="47.81" y="36.41049" style="stroke: #000000; stroke-width: 0.8" />
4633
  </g>
4634
  </g>
4635
  <g id="text_13">
4636
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="40.209709" transform="rotate(-0 40.81 40.209709)">2.2</text>
4637
  </g>
4638
  </g>
4639
  <g id="label--y" class="ylabel">
@@ -4641,73 +4641,73 @@ Installed 37 packages in 206ms
4641
  </g>
4642
  </g>
4643
  <g id="series--torch-flash-ma" class="series">
4644
- <path d="M 83.607806 337.885652 L 226.799032 325.671141 L 369.990258 317.408887 L 513.181484 307.237447 L 656.37271 265.438813 L 799.563935 253.93491 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4645
  <defs>
4646
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4647
  </defs>
4648
  <g clip-path="url(#p09feef2583)">
4649
- <use ns4:href="#md7efaf3aec" x="83.607806" y="337.885652" style="fill: #1f77b4; stroke: #1f77b4" />
4650
- <use ns4:href="#md7efaf3aec" x="226.799032" y="325.671141" style="fill: #1f77b4; stroke: #1f77b4" />
4651
- <use ns4:href="#md7efaf3aec" x="369.990258" y="317.408887" style="fill: #1f77b4; stroke: #1f77b4" />
4652
- <use ns4:href="#md7efaf3aec" x="513.181484" y="307.237447" style="fill: #1f77b4; stroke: #1f77b4" />
4653
- <use ns4:href="#md7efaf3aec" x="656.37271" y="265.438813" style="fill: #1f77b4; stroke: #1f77b4" />
4654
- <use ns4:href="#md7efaf3aec" x="799.563935" y="253.93491" style="fill: #1f77b4; stroke: #1f77b4" />
4655
  </g>
4656
  </g>
4657
  <g id="series--torch-mem-eff" class="series">
4658
- <path d="M 83.607806 150.723364 L 226.799032 118.266314 L 369.990258 111.819309 L 513.181484 85.541185 L 656.37271 73.225726 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4659
  <defs>
4660
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4661
  </defs>
4662
  <g clip-path="url(#p09feef2583)">
4663
- <use ns4:href="#m9b8c54d372" x="83.607806" y="150.723364" style="fill: #ff7f0e; stroke: #ff7f0e" />
4664
- <use ns4:href="#m9b8c54d372" x="226.799032" y="118.266314" style="fill: #ff7f0e; stroke: #ff7f0e" />
4665
- <use ns4:href="#m9b8c54d372" x="369.990258" y="111.819309" style="fill: #ff7f0e; stroke: #ff7f0e" />
4666
- <use ns4:href="#m9b8c54d372" x="513.181484" y="85.541185" style="fill: #ff7f0e; stroke: #ff7f0e" />
4667
- <use ns4:href="#m9b8c54d372" x="656.37271" y="73.225726" style="fill: #ff7f0e; stroke: #ff7f0e" />
4668
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4669
  </g>
4670
  </g>
4671
  <g id="series--xformers-meff" class="series">
4672
- <path d="M 83.607806 408.996061 L 226.799032 390.330886 L 369.990258 383.840574 L 513.181484 376.133386 L 656.37271 322.193132 L 799.563935 327.713603 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4673
  <defs>
4674
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4675
  </defs>
4676
  <g clip-path="url(#p09feef2583)">
4677
- <use ns4:href="#mc655281e0b" x="83.607806" y="408.996061" style="fill: #2ca02c; stroke: #2ca02c" />
4678
- <use ns4:href="#mc655281e0b" x="226.799032" y="390.330886" style="fill: #2ca02c; stroke: #2ca02c" />
4679
- <use ns4:href="#mc655281e0b" x="369.990258" y="383.840574" style="fill: #2ca02c; stroke: #2ca02c" />
4680
- <use ns4:href="#mc655281e0b" x="513.181484" y="376.133386" style="fill: #2ca02c; stroke: #2ca02c" />
4681
- <use ns4:href="#mc655281e0b" x="656.37271" y="322.193132" style="fill: #2ca02c; stroke: #2ca02c" />
4682
- <use ns4:href="#mc655281e0b" x="799.563935" y="327.713603" style="fill: #2ca02c; stroke: #2ca02c" />
4683
  </g>
4684
  </g>
4685
  <g id="series--hf-kernels-flash-attn" class="series">
4686
- <path d="M 83.607806 417.466618 L 226.799032 402.775353 L 369.990258 391.490731 L 513.181484 383.472462 L 656.37271 336.492496 L 799.563935 330.032377 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4687
  <defs>
4688
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4689
  </defs>
4690
  <g clip-path="url(#p09feef2583)">
4691
- <use ns4:href="#m61c8040d7e" x="83.607806" y="417.466618" style="fill: #d62728; stroke: #d62728" />
4692
- <use ns4:href="#m61c8040d7e" x="226.799032" y="402.775353" style="fill: #d62728; stroke: #d62728" />
4693
- <use ns4:href="#m61c8040d7e" x="369.990258" y="391.490731" style="fill: #d62728; stroke: #d62728" />
4694
- <use ns4:href="#m61c8040d7e" x="513.181484" y="383.472462" style="fill: #d62728; stroke: #d62728" />
4695
- <use ns4:href="#m61c8040d7e" x="656.37271" y="336.492496" style="fill: #d62728; stroke: #d62728" />
4696
- <use ns4:href="#m61c8040d7e" x="799.563935" y="330.032377" style="fill: #d62728; stroke: #d62728" />
4697
  </g>
4698
  </g>
4699
  <g id="series--hf-kernels-flash-attn3" class="series">
4700
- <path d="M 83.607806 428.387702 L 226.799032 409.68288 L 369.990258 400.234552 L 513.181484 400.520929 L 656.37271 346.614528 L 799.563935 349.227915 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4701
  <defs>
4702
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4703
  </defs>
4704
  <g clip-path="url(#p09feef2583)">
4705
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4706
- <use ns4:href="#m7cd35be9cc" x="226.799032" y="409.68288" style="fill: #9467bd; stroke: #9467bd" />
4707
- <use ns4:href="#m7cd35be9cc" x="369.990258" y="400.234552" style="fill: #9467bd; stroke: #9467bd" />
4708
- <use ns4:href="#m7cd35be9cc" x="513.181484" y="400.520929" style="fill: #9467bd; stroke: #9467bd" />
4709
- <use ns4:href="#m7cd35be9cc" x="656.37271" y="346.614528" style="fill: #9467bd; stroke: #9467bd" />
4710
- <use ns4:href="#m7cd35be9cc" x="799.563935" y="349.227915" style="fill: #9467bd; stroke: #9467bd" />
4711
  </g>
4712
  </g>
4713
  <g id="patch_3">
 
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
+ <dc:date>2025-12-19T23:02:45.375383</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
 
3999
  <g id="matplotlib.axis_2">
4000
  <g id="ytick_1">
4001
  <g id="grid-y--2" class="grid grid-y">
4002
+ <path d="M 47.81 407.59176 L 835.361742 407.59176 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4003
  </g>
4004
  <g id="line2d_7">
4005
  <defs>
4006
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4007
  </defs>
4008
  <g>
4009
+ <use ns4:href="#m0fca2865ba" x="47.81" y="407.59176" style="stroke: #000000; stroke-width: 0.8" />
4010
  </g>
4011
  </g>
4012
  <g id="text_7">
4013
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="411.390978" transform="rotate(-0 40.81 411.390978)">1.0</text>
4014
  </g>
4015
  </g>
4016
  <g id="ytick_2">
4017
  <g id="grid-y--3" class="grid grid-y">
4018
+ <path d="M 47.81 349.696597 L 835.361742 349.696597 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4019
  </g>
4020
  <g id="line2d_8">
4021
  <g>
4022
+ <use ns4:href="#m0fca2865ba" x="47.81" y="349.696597" style="stroke: #000000; stroke-width: 0.8" />
4023
  </g>
4024
  </g>
4025
  <g id="text_8">
4026
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="353.495815" transform="rotate(-0 40.81 353.495815)">1.2</text>
4027
  </g>
4028
  </g>
4029
  <g id="ytick_3">
4030
  <g id="grid-y--4" class="grid grid-y">
4031
+ <path d="M 47.81 291.801434 L 835.361742 291.801434 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4032
  </g>
4033
  <g id="line2d_9">
4034
  <g>
4035
+ <use ns4:href="#m0fca2865ba" x="47.81" y="291.801434" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="295.600653" transform="rotate(-0 40.81 295.600653)">1.4</text>
4040
  </g>
4041
  </g>
4042
  <g id="ytick_4">
4043
  <g id="grid-y--5" class="grid grid-y">
4044
+ <path d="M 47.81 233.906271 L 835.361742 233.906271 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4045
  </g>
4046
  <g id="line2d_10">
4047
  <g>
4048
+ <use ns4:href="#m0fca2865ba" x="47.81" y="233.906271" style="stroke: #000000; stroke-width: 0.8" />
4049
  </g>
4050
  </g>
4051
  <g id="text_10">
4052
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="237.70549" transform="rotate(-0 40.81 237.70549)">1.6</text>
4053
  </g>
4054
  </g>
4055
  <g id="ytick_5">
4056
  <g id="grid-y--6" class="grid grid-y">
4057
+ <path d="M 47.81 176.011108 L 835.361742 176.011108 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4058
  </g>
4059
  <g id="line2d_11">
4060
  <g>
4061
+ <use ns4:href="#m0fca2865ba" x="47.81" y="176.011108" style="stroke: #000000; stroke-width: 0.8" />
4062
  </g>
4063
  </g>
4064
  <g id="text_11">
4065
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="179.810327" transform="rotate(-0 40.81 179.810327)">1.8</text>
4066
  </g>
4067
  </g>
4068
  <g id="ytick_6">
4069
  <g id="grid-y--7" class="grid grid-y">
4070
+ <path d="M 47.81 118.115945 L 835.361742 118.115945 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4071
  </g>
4072
  <g id="line2d_12">
4073
  <g>
4074
+ <use ns4:href="#m0fca2865ba" x="47.81" y="118.115945" style="stroke: #000000; stroke-width: 0.8" />
4075
  </g>
4076
  </g>
4077
  <g id="text_12">
4078
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="121.915164" transform="rotate(-0 40.81 121.915164)">2.0</text>
4079
  </g>
4080
  </g>
4081
  <g id="ytick_7">
4082
  <g id="grid-y--8" class="grid grid-y">
4083
+ <path d="M 47.81 60.220782 L 835.361742 60.220782 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4084
  </g>
4085
  <g id="line2d_13">
4086
  <g>
4087
+ <use ns4:href="#m0fca2865ba" x="47.81" y="60.220782" style="stroke: #000000; stroke-width: 0.8" />
4088
  </g>
4089
  </g>
4090
  <g id="text_13">
4091
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="64.020001" transform="rotate(-0 40.81 64.020001)">2.2</text>
4092
  </g>
4093
  </g>
4094
  <g id="label--y" class="ylabel">
 
4096
  </g>
4097
  </g>
4098
  <g id="series--torch-flash-ma" class="series">
4099
+ <path d="M 83.607806 345.769568 L 226.799032 327.905436 L 369.990258 321.175124 L 513.181484 310.733152 L 656.37271 266.384878 L 799.563935 257.816105 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4100
  <defs>
4101
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4102
  </defs>
4103
  <g clip-path="url(#p09feef2583)">
4104
+ <use ns4:href="#md7efaf3aec" x="83.607806" y="345.769568" style="fill: #1f77b4; stroke: #1f77b4" />
4105
+ <use ns4:href="#md7efaf3aec" x="226.799032" y="327.905436" style="fill: #1f77b4; stroke: #1f77b4" />
4106
+ <use ns4:href="#md7efaf3aec" x="369.990258" y="321.175124" style="fill: #1f77b4; stroke: #1f77b4" />
4107
+ <use ns4:href="#md7efaf3aec" x="513.181484" y="310.733152" style="fill: #1f77b4; stroke: #1f77b4" />
4108
+ <use ns4:href="#md7efaf3aec" x="656.37271" y="266.384878" style="fill: #1f77b4; stroke: #1f77b4" />
4109
+ <use ns4:href="#md7efaf3aec" x="799.563935" y="257.816105" style="fill: #1f77b4; stroke: #1f77b4" />
4110
  </g>
4111
  </g>
4112
  <g id="series--torch-mem-eff" class="series">
4113
+ <path d="M 83.607806 158.269135 L 226.799032 141.655829 L 369.990258 111.190815 L 513.181484 119.895642 L 656.37271 90.298767 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4114
  <defs>
4115
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4116
  </defs>
4117
  <g clip-path="url(#p09feef2583)">
4118
+ <use ns4:href="#m9b8c54d372" x="83.607806" y="158.269135" style="fill: #ff7f0e; stroke: #ff7f0e" />
4119
+ <use ns4:href="#m9b8c54d372" x="226.799032" y="141.655829" style="fill: #ff7f0e; stroke: #ff7f0e" />
4120
+ <use ns4:href="#m9b8c54d372" x="369.990258" y="111.190815" style="fill: #ff7f0e; stroke: #ff7f0e" />
4121
+ <use ns4:href="#m9b8c54d372" x="513.181484" y="119.895642" style="fill: #ff7f0e; stroke: #ff7f0e" />
4122
+ <use ns4:href="#m9b8c54d372" x="656.37271" y="90.298767" style="fill: #ff7f0e; stroke: #ff7f0e" />
4123
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4124
  </g>
4125
  </g>
4126
  <g id="series--xformers-meff" class="series">
4127
+ <path d="M 83.607806 410.251753 L 226.799032 397.439264 L 369.990258 386.870212 L 513.181484 384.759934 L 656.37271 333.145238 L 799.563935 334.337878 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4128
  <defs>
4129
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4130
  </defs>
4131
  <g clip-path="url(#p09feef2583)">
4132
+ <use ns4:href="#mc655281e0b" x="83.607806" y="410.251753" style="fill: #2ca02c; stroke: #2ca02c" />
4133
+ <use ns4:href="#mc655281e0b" x="226.799032" y="397.439264" style="fill: #2ca02c; stroke: #2ca02c" />
4134
+ <use ns4:href="#mc655281e0b" x="369.990258" y="386.870212" style="fill: #2ca02c; stroke: #2ca02c" />
4135
+ <use ns4:href="#mc655281e0b" x="513.181484" y="384.759934" style="fill: #2ca02c; stroke: #2ca02c" />
4136
+ <use ns4:href="#mc655281e0b" x="656.37271" y="333.145238" style="fill: #2ca02c; stroke: #2ca02c" />
4137
+ <use ns4:href="#mc655281e0b" x="799.563935" y="334.337878" style="fill: #2ca02c; stroke: #2ca02c" />
4138
  </g>
4139
  </g>
4140
  <g id="series--hf-kernels-flash-attn" class="series">
4141
+ <path d="M 83.607806 417.931836 L 226.799032 403.327202 L 369.990258 389.605759 L 513.181484 383.179396 L 656.37271 337.13711 L 799.563935 336.430789 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4142
  <defs>
4143
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4144
  </defs>
4145
  <g clip-path="url(#p09feef2583)">
4146
+ <use ns4:href="#m61c8040d7e" x="83.607806" y="417.931836" style="fill: #d62728; stroke: #d62728" />
4147
+ <use ns4:href="#m61c8040d7e" x="226.799032" y="403.327202" style="fill: #d62728; stroke: #d62728" />
4148
+ <use ns4:href="#m61c8040d7e" x="369.990258" y="389.605759" style="fill: #d62728; stroke: #d62728" />
4149
+ <use ns4:href="#m61c8040d7e" x="513.181484" y="383.179396" style="fill: #d62728; stroke: #d62728" />
4150
+ <use ns4:href="#m61c8040d7e" x="656.37271" y="337.13711" style="fill: #d62728; stroke: #d62728" />
4151
+ <use ns4:href="#m61c8040d7e" x="799.563935" y="336.430789" style="fill: #d62728; stroke: #d62728" />
4152
  </g>
4153
  </g>
4154
  <g id="series--hf-kernels-flash-attn3" class="series">
4155
+ <path d="M 83.607806 428.387702 L 226.799032 413.453355 L 369.990258 405.098794 L 513.181484 402.988515 L 656.37271 353.264386 L 799.563935 343.383998 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4156
  <defs>
4157
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4158
  </defs>
4159
  <g clip-path="url(#p09feef2583)">
4160
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4161
+ <use ns4:href="#m7cd35be9cc" x="226.799032" y="413.453355" style="fill: #9467bd; stroke: #9467bd" />
4162
+ <use ns4:href="#m7cd35be9cc" x="369.990258" y="405.098794" style="fill: #9467bd; stroke: #9467bd" />
4163
+ <use ns4:href="#m7cd35be9cc" x="513.181484" y="402.988515" style="fill: #9467bd; stroke: #9467bd" />
4164
+ <use ns4:href="#m7cd35be9cc" x="656.37271" y="353.264386" style="fill: #9467bd; stroke: #9467bd" />
4165
+ <use ns4:href="#m7cd35be9cc" x="799.563935" y="343.383998" style="fill: #9467bd; stroke: #9467bd" />
4166
  </g>
4167
  </g>
4168
  <g id="patch_3">
 
4247
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4248
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4249
  </span> |
4250
+ Cell: combine | 4.45s
4251
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4252
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4253
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4356
  COMBINED BENCHMARK SUMMARY
4357
 
4358
  impl wl p50(ms) ok
4359
+ hf_kernels_flash_attn cuda_attn_L128_bfloat16 0.96 True
4360
+ hf_kernels_flash_attn cuda_attn_L256_bfloat16 1.01 True
4361
+ hf_kernels_flash_attn cuda_attn_L320_bfloat16 1.06 True
4362
+ hf_kernels_flash_attn cuda_attn_L384_bfloat16 1.08 True
4363
+ hf_kernels_flash_attn cuda_attn_L448_bfloat16 1.24 True
4364
+ hf_kernels_flash_attn cuda_attn_L512_bfloat16 1.25 True
4365
+ hf_kernels_flash_attn3 cuda_attn_L128_bfloat16 0.93 True
4366
  hf_kernels_flash_attn3 cuda_attn_L256_bfloat16 0.98 True
4367
  hf_kernels_flash_attn3 cuda_attn_L320_bfloat16 1.01 True
4368
+ hf_kernels_flash_attn3 cuda_attn_L384_bfloat16 1.02 True
4369
+ hf_kernels_flash_attn3 cuda_attn_L448_bfloat16 1.19 True
4370
+ hf_kernels_flash_attn3 cuda_attn_L512_bfloat16 1.22 True
4371
  sage_int8_fp16 cuda_attn_L128_bfloat16 FAIL False
4372
+ Error: module &#x27;sage_attention_b91c5fb7ee1dcfba&#x27; has no attribute &#x27;fwd&#x27;
4373
  sage_int8_fp16 cuda_attn_L256_bfloat16 FAIL False
4374
+ Error: module &#x27;sage_attention_b91c5fb7ee1dcfba&#x27; has no attribute &#x27;fwd&#x27;
4375
  sage_int8_fp16 cuda_attn_L320_bfloat16 FAIL False
4376
+ Error: module &#x27;sage_attention_b91c5fb7ee1dcfba&#x27; has no attribute &#x27;fwd&#x27;
4377
  sage_int8_fp16 cuda_attn_L384_bfloat16 FAIL False
4378
+ Error: module &#x27;sage_attention_b91c5fb7ee1dcfba&#x27; has no attribute &#x27;fwd&#x27;
4379
  sage_int8_fp16 cuda_attn_L448_bfloat16 FAIL False
4380
+ Error: module &#x27;sage_attention_b91c5fb7ee1dcfba&#x27; has no attribute &#x27;fwd&#x27;
4381
  sage_int8_fp16 cuda_attn_L512_bfloat16 FAIL False
4382
+ Error: module &#x27;sage_attention_b91c5fb7ee1dcfba&#x27; has no attribute &#x27;fwd&#x27;
4383
  torch_flash_ma cuda_attn_L128_bfloat16 1.21 True
4384
+ torch_flash_ma cuda_attn_L256_bfloat16 1.28 True
4385
+ torch_flash_ma cuda_attn_L320_bfloat16 1.30 True
4386
+ torch_flash_ma cuda_attn_L384_bfloat16 1.33 True
4387
+ torch_flash_ma cuda_attn_L448_bfloat16 1.49 True
4388
+ torch_flash_ma cuda_attn_L512_bfloat16 1.52 True
4389
+ torch_mem_eff cuda_attn_L128_bfloat16 1.86 True
4390
+ torch_mem_eff cuda_attn_L256_bfloat16 1.92 True
4391
+ torch_mem_eff cuda_attn_L320_bfloat16 2.02 True
4392
+ torch_mem_eff cuda_attn_L384_bfloat16 1.99 True
4393
+ torch_mem_eff cuda_attn_L448_bfloat16 2.10 True
4394
+ torch_mem_eff cuda_attn_L512_bfloat16 2.25 True
4395
+ xformers_meff cuda_attn_L128_bfloat16 0.99 True
4396
  xformers_meff cuda_attn_L256_bfloat16 1.04 True
4397
+ xformers_meff cuda_attn_L320_bfloat16 1.07 True
4398
+ xformers_meff cuda_attn_L384_bfloat16 1.08 True
4399
  xformers_meff cuda_attn_L448_bfloat16 1.26 True
4400
+ xformers_meff cuda_attn_L512_bfloat16 1.25 True
4401
 
4402
  GENERATING COMBINED VISUALIZATION
4403
 
 
4421
  <div class="uv-install-logs" id="uv-logs-combine">
4422
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4423
  <div class="uv-logs-content" style="display: none;">
4424
+ Installed 37 packages in 247ms
4425
  </div>
4426
  </div>
4427
  <div class="cell-artifacts">
 
4434
  <rdf:RDF>
4435
  <ns2:Work>
4436
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4437
+ <dc:date>2025-12-19T23:02:45.375383</dc:date>
4438
  <dc:format>image/svg+xml</dc:format>
4439
  <dc:creator>
4440
  <ns2:Agent>
 
4544
  <g id="matplotlib.axis_2">
4545
  <g id="ytick_1">
4546
  <g id="grid-y--2" class="grid grid-y">
4547
+ <path d="M 47.81 407.59176 L 835.361742 407.59176 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4548
  </g>
4549
  <g id="line2d_7">
4550
  <defs>
4551
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4552
  </defs>
4553
  <g>
4554
+ <use ns4:href="#m0fca2865ba" x="47.81" y="407.59176" style="stroke: #000000; stroke-width: 0.8" />
4555
  </g>
4556
  </g>
4557
  <g id="text_7">
4558
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="411.390978" transform="rotate(-0 40.81 411.390978)">1.0</text>
4559
  </g>
4560
  </g>
4561
  <g id="ytick_2">
4562
  <g id="grid-y--3" class="grid grid-y">
4563
+ <path d="M 47.81 349.696597 L 835.361742 349.696597 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4564
  </g>
4565
  <g id="line2d_8">
4566
  <g>
4567
+ <use ns4:href="#m0fca2865ba" x="47.81" y="349.696597" style="stroke: #000000; stroke-width: 0.8" />
4568
  </g>
4569
  </g>
4570
  <g id="text_8">
4571
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="353.495815" transform="rotate(-0 40.81 353.495815)">1.2</text>
4572
  </g>
4573
  </g>
4574
  <g id="ytick_3">
4575
  <g id="grid-y--4" class="grid grid-y">
4576
+ <path d="M 47.81 291.801434 L 835.361742 291.801434 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4577
  </g>
4578
  <g id="line2d_9">
4579
  <g>
4580
+ <use ns4:href="#m0fca2865ba" x="47.81" y="291.801434" style="stroke: #000000; stroke-width: 0.8" />
4581
  </g>
4582
  </g>
4583
  <g id="text_9">
4584
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="295.600653" transform="rotate(-0 40.81 295.600653)">1.4</text>
4585
  </g>
4586
  </g>
4587
  <g id="ytick_4">
4588
  <g id="grid-y--5" class="grid grid-y">
4589
+ <path d="M 47.81 233.906271 L 835.361742 233.906271 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4590
  </g>
4591
  <g id="line2d_10">
4592
  <g>
4593
+ <use ns4:href="#m0fca2865ba" x="47.81" y="233.906271" style="stroke: #000000; stroke-width: 0.8" />
4594
  </g>
4595
  </g>
4596
  <g id="text_10">
4597
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="237.70549" transform="rotate(-0 40.81 237.70549)">1.6</text>
4598
  </g>
4599
  </g>
4600
  <g id="ytick_5">
4601
  <g id="grid-y--6" class="grid grid-y">
4602
+ <path d="M 47.81 176.011108 L 835.361742 176.011108 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4603
  </g>
4604
  <g id="line2d_11">
4605
  <g>
4606
+ <use ns4:href="#m0fca2865ba" x="47.81" y="176.011108" style="stroke: #000000; stroke-width: 0.8" />
4607
  </g>
4608
  </g>
4609
  <g id="text_11">
4610
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="179.810327" transform="rotate(-0 40.81 179.810327)">1.8</text>
4611
  </g>
4612
  </g>
4613
  <g id="ytick_6">
4614
  <g id="grid-y--7" class="grid grid-y">
4615
+ <path d="M 47.81 118.115945 L 835.361742 118.115945 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4616
  </g>
4617
  <g id="line2d_12">
4618
  <g>
4619
+ <use ns4:href="#m0fca2865ba" x="47.81" y="118.115945" style="stroke: #000000; stroke-width: 0.8" />
4620
  </g>
4621
  </g>
4622
  <g id="text_12">
4623
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="121.915164" transform="rotate(-0 40.81 121.915164)">2.0</text>
4624
  </g>
4625
  </g>
4626
  <g id="ytick_7">
4627
  <g id="grid-y--8" class="grid grid-y">
4628
+ <path d="M 47.81 60.220782 L 835.361742 60.220782 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4629
  </g>
4630
  <g id="line2d_13">
4631
  <g>
4632
+ <use ns4:href="#m0fca2865ba" x="47.81" y="60.220782" style="stroke: #000000; stroke-width: 0.8" />
4633
  </g>
4634
  </g>
4635
  <g id="text_13">
4636
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.81" y="64.020001" transform="rotate(-0 40.81 64.020001)">2.2</text>
4637
  </g>
4638
  </g>
4639
  <g id="label--y" class="ylabel">
 
4641
  </g>
4642
  </g>
4643
  <g id="series--torch-flash-ma" class="series">
4644
+ <path d="M 83.607806 345.769568 L 226.799032 327.905436 L 369.990258 321.175124 L 513.181484 310.733152 L 656.37271 266.384878 L 799.563935 257.816105 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4645
  <defs>
4646
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4647
  </defs>
4648
  <g clip-path="url(#p09feef2583)">
4649
+ <use ns4:href="#md7efaf3aec" x="83.607806" y="345.769568" style="fill: #1f77b4; stroke: #1f77b4" />
4650
+ <use ns4:href="#md7efaf3aec" x="226.799032" y="327.905436" style="fill: #1f77b4; stroke: #1f77b4" />
4651
+ <use ns4:href="#md7efaf3aec" x="369.990258" y="321.175124" style="fill: #1f77b4; stroke: #1f77b4" />
4652
+ <use ns4:href="#md7efaf3aec" x="513.181484" y="310.733152" style="fill: #1f77b4; stroke: #1f77b4" />
4653
+ <use ns4:href="#md7efaf3aec" x="656.37271" y="266.384878" style="fill: #1f77b4; stroke: #1f77b4" />
4654
+ <use ns4:href="#md7efaf3aec" x="799.563935" y="257.816105" style="fill: #1f77b4; stroke: #1f77b4" />
4655
  </g>
4656
  </g>
4657
  <g id="series--torch-mem-eff" class="series">
4658
+ <path d="M 83.607806 158.269135 L 226.799032 141.655829 L 369.990258 111.190815 L 513.181484 119.895642 L 656.37271 90.298767 L 799.563935 45.999414 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4659
  <defs>
4660
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4661
  </defs>
4662
  <g clip-path="url(#p09feef2583)">
4663
+ <use ns4:href="#m9b8c54d372" x="83.607806" y="158.269135" style="fill: #ff7f0e; stroke: #ff7f0e" />
4664
+ <use ns4:href="#m9b8c54d372" x="226.799032" y="141.655829" style="fill: #ff7f0e; stroke: #ff7f0e" />
4665
+ <use ns4:href="#m9b8c54d372" x="369.990258" y="111.190815" style="fill: #ff7f0e; stroke: #ff7f0e" />
4666
+ <use ns4:href="#m9b8c54d372" x="513.181484" y="119.895642" style="fill: #ff7f0e; stroke: #ff7f0e" />
4667
+ <use ns4:href="#m9b8c54d372" x="656.37271" y="90.298767" style="fill: #ff7f0e; stroke: #ff7f0e" />
4668
  <use ns4:href="#m9b8c54d372" x="799.563935" y="45.999414" style="fill: #ff7f0e; stroke: #ff7f0e" />
4669
  </g>
4670
  </g>
4671
  <g id="series--xformers-meff" class="series">
4672
+ <path d="M 83.607806 410.251753 L 226.799032 397.439264 L 369.990258 386.870212 L 513.181484 384.759934 L 656.37271 333.145238 L 799.563935 334.337878 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #2ca02c; stroke-width: 1.5; stroke-linecap: square" />
4673
  <defs>
4674
  <path id="mc655281e0b" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #2ca02c" />
4675
  </defs>
4676
  <g clip-path="url(#p09feef2583)">
4677
+ <use ns4:href="#mc655281e0b" x="83.607806" y="410.251753" style="fill: #2ca02c; stroke: #2ca02c" />
4678
+ <use ns4:href="#mc655281e0b" x="226.799032" y="397.439264" style="fill: #2ca02c; stroke: #2ca02c" />
4679
+ <use ns4:href="#mc655281e0b" x="369.990258" y="386.870212" style="fill: #2ca02c; stroke: #2ca02c" />
4680
+ <use ns4:href="#mc655281e0b" x="513.181484" y="384.759934" style="fill: #2ca02c; stroke: #2ca02c" />
4681
+ <use ns4:href="#mc655281e0b" x="656.37271" y="333.145238" style="fill: #2ca02c; stroke: #2ca02c" />
4682
+ <use ns4:href="#mc655281e0b" x="799.563935" y="334.337878" style="fill: #2ca02c; stroke: #2ca02c" />
4683
  </g>
4684
  </g>
4685
  <g id="series--hf-kernels-flash-attn" class="series">
4686
+ <path d="M 83.607806 417.931836 L 226.799032 403.327202 L 369.990258 389.605759 L 513.181484 383.179396 L 656.37271 337.13711 L 799.563935 336.430789 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #d62728; stroke-width: 1.5; stroke-linecap: square" />
4687
  <defs>
4688
  <path id="m61c8040d7e" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #d62728" />
4689
  </defs>
4690
  <g clip-path="url(#p09feef2583)">
4691
+ <use ns4:href="#m61c8040d7e" x="83.607806" y="417.931836" style="fill: #d62728; stroke: #d62728" />
4692
+ <use ns4:href="#m61c8040d7e" x="226.799032" y="403.327202" style="fill: #d62728; stroke: #d62728" />
4693
+ <use ns4:href="#m61c8040d7e" x="369.990258" y="389.605759" style="fill: #d62728; stroke: #d62728" />
4694
+ <use ns4:href="#m61c8040d7e" x="513.181484" y="383.179396" style="fill: #d62728; stroke: #d62728" />
4695
+ <use ns4:href="#m61c8040d7e" x="656.37271" y="337.13711" style="fill: #d62728; stroke: #d62728" />
4696
+ <use ns4:href="#m61c8040d7e" x="799.563935" y="336.430789" style="fill: #d62728; stroke: #d62728" />
4697
  </g>
4698
  </g>
4699
  <g id="series--hf-kernels-flash-attn3" class="series">
4700
+ <path d="M 83.607806 428.387702 L 226.799032 413.453355 L 369.990258 405.098794 L 513.181484 402.988515 L 656.37271 353.264386 L 799.563935 343.383998 " clip-path="url(#p09feef2583)" style="fill: none; stroke: #9467bd; stroke-width: 1.5; stroke-linecap: square" />
4701
  <defs>
4702
  <path id="m7cd35be9cc" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #9467bd" />
4703
  </defs>
4704
  <g clip-path="url(#p09feef2583)">
4705
  <use ns4:href="#m7cd35be9cc" x="83.607806" y="428.387702" style="fill: #9467bd; stroke: #9467bd" />
4706
+ <use ns4:href="#m7cd35be9cc" x="226.799032" y="413.453355" style="fill: #9467bd; stroke: #9467bd" />
4707
+ <use ns4:href="#m7cd35be9cc" x="369.990258" y="405.098794" style="fill: #9467bd; stroke: #9467bd" />
4708
+ <use ns4:href="#m7cd35be9cc" x="513.181484" y="402.988515" style="fill: #9467bd; stroke: #9467bd" />
4709
+ <use ns4:href="#m7cd35be9cc" x="656.37271" y="353.264386" style="fill: #9467bd; stroke: #9467bd" />
4710
+ <use ns4:href="#m7cd35be9cc" x="799.563935" y="343.383998" style="fill: #9467bd; stroke: #9467bd" />
4711
  </g>
4712
  </g>
4713
  <g id="patch_3">
index.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Darwin arm64 | macOS-15.7.2-arm64-arm-64bit
3878
  </div>
3879
  </div>
3880
 
layer_norm/impls/artifacts/benchmark/layer_norm.jsonl CHANGED
@@ -1,4 +1,4 @@
1
- {"ts": "2025-12-19T19:41:22Z", "run": "e4072b52508346c79afed4185ddfbd8a", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8300210000129482, "p50": 0.8342720000200643, "p90": 0.83692099997279, "mean": 0.8337814000014987, "iqr": 0.006369000004724512, "raw_times": [0.8300210000129482, 0.83692099997279, 0.8342720000200643, 0.8305519999680655, 0.8371410000336255], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8391320000100677, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
2
- {"ts": "2025-12-19T19:41:22Z", "run": "e4072b52508346c79afed4185ddfbd8a", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6468019999820172, "p50": 1.6505419999930382, "p90": 1.6509619999851566, "mean": 1.650563999987753, "iqr": 0.0014700000292577897, "raw_times": [1.6509619999851566, 1.6505419999930382, 1.6468019999820172, 1.6494919999558988, 1.6550220000226545], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6566229999739335, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
3
- {"ts": "2025-12-19T19:41:22Z", "run": "e4072b52508346c79afed4185ddfbd8a", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.643002000037086, "p50": 1.6493729999638163, "p90": 1.6502219999665613, "mean": 1.6475743999876613, "iqr": 0.0068199999532225775, "raw_times": [1.643002000037086, 1.6434020000133387, 1.6502219999665613, 1.6493729999638163, 1.6518729999575044], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6485120000311326, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
4
- {"ts": "2025-12-19T19:41:22Z", "run": "e4072b52508346c79afed4185ddfbd8a", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.2460340000284305, "p50": 3.2577039999637236, "p90": 3.260522999994464, "mean": 3.2551376000014898, "iqr": 0.011509999978898122, "raw_times": [3.260522999994464, 3.2460340000284305, 3.249013000015566, 3.2577039999637236, 3.262414000005265], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2401029999959974, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
 
1
+ {"ts": "2025-12-19T23:02:16Z", "run": "32d018bc53624a45997f9dda67216816", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D4096", "batch": 16, "seq_len": 2048, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8274980000351206, "p50": 0.8322979999775271, "p90": 0.8378580000680813, "mean": 0.8332618000167713, "iqr": 0.0071710001066094264, "raw_times": [0.8322979999775271, 0.8378580000680813, 0.8379680000416556, 0.8306869999614719, 0.8274980000351206], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8445380001376179, "peak_bytes": 2415935488, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
2
+ {"ts": "2025-12-19T23:02:16Z", "run": "32d018bc53624a45997f9dda67216816", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S2048_D8192", "batch": 16, "seq_len": 2048, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6395549998833303, "p50": 1.6463560000374855, "p90": 1.6514159999587719, "mean": 1.6487175999827741, "iqr": 0.00707099979990744, "raw_times": [1.6395549998833303, 1.6514159999587719, 1.6463560000374855, 1.6443450001588644, 1.6619159998754185], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.6726759999983187, "peak_bytes": 4831870976, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1086463928222656e-05, "ref": "layer_norm_ref"}, "err": null}
3
+ {"ts": "2025-12-19T23:02:16Z", "run": "32d018bc53624a45997f9dda67216816", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D4096", "batch": 16, "seq_len": 4096, "hidden_dim": 4096, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 1.6412159998253628, "p50": 1.644736000116609, "p90": 1.6461760001220682, "mean": 1.6448379999474128, "iqr": 0.0036900003124173963, "raw_times": [1.644736000116609, 1.6412159998253628, 1.649575999863373, 1.6461760001220682, 1.6424859998096508], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 1.646575999984634, "peak_bytes": 4831854592, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015411376953125, "mse": 1.1205673217773438e-05, "ref": "layer_norm_ref"}, "err": null}
4
+ {"ts": "2025-12-19T23:02:17Z", "run": "32d018bc53624a45997f9dda67216816", "impl": "hf_kernels_layer_norm", "tags": {"family": "hf-kernels", "repo": "kernels-community/layer-norm", "op": "layer_norm"}, "wl": {"name": "LN_B16_S4096_D8192", "batch": 16, "seq_len": 4096, "hidden_dim": 8192, "dtype": "bfloat16", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.2493999999587686, "p50": 3.2569499999226537, "p90": 3.2582300000285613, "mean": 3.2570102000136103, "iqr": 0.006920000032550888, "raw_times": [3.2493999999587686, 3.2691610001620575, 3.2513099999960104, 3.2569499999226537, 3.2582300000285613], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.2572910001817945, "peak_bytes": 9663709184, "ok": true, "absmax": 0.03125, "corr": {"ok": true, "rtol": 0.001, "atol": 0.03125, "absmax": 0.03125, "mae": 0.0015106201171875, "mse": 1.1026859283447266e-05, "ref": "layer_norm_ref"}, "err": null}
layer_norm/impls/hf_kernels_layer_norm.html CHANGED
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
3889
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3890
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3891
  </span> |
3892
- Cell: benchmark | 6.61s
3893
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3894
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3895
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3961,19 +3961,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D4096
3961
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3962
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3963
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3964
- hf_kernels_layer_norm 4.34% 182.243us 49.12% 2.065ms 2.065ms 0.000us 0.00% 3.103ms 3.103ms 1
3965
- _layer_norm_f8ec252::dropout_add_ln_fwd 1.54% 64.542us 44.23% 1.860ms 619.846us 2.366ms 100.00% 3.103ms 1.034ms 3
3966
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.367ms 100.06% 2.367ms 2.367ms 1
3967
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.366ms 100.00% 2.366ms 788.551us 3
3968
- Activity Buffer Request 40.34% 1.696ms 40.34% 1.696ms 1.696ms 737.372us 31.17% 737.372us 737.372us 1
3969
- aten::view 0.55% 23.192us 0.55% 23.192us 3.865us 0.000us 0.00% 0.000us 0.000us 6
3970
- aten::empty 1.11% 46.641us 1.11% 46.641us 5.182us 0.000us 0.00% 0.000us 0.000us 9
3971
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.21% 8.950us 0.21% 8.950us 2.983us 0.000us 0.00% 0.000us 0.000us 3
3972
- cudaLaunchKernel 1.04% 43.741us 1.04% 43.741us 14.580us 0.000us 0.00% 0.000us 0.000us 3
3973
- cudaDeviceSynchronize 50.88% 2.139ms 50.88% 2.139ms 2.139ms 0.000us 0.00% 0.000us 0.000us 1
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
- Self CPU time total: 4.204ms
3976
- Self CUDA time total: 2.366ms
3977
 
3978
 
3979
 
@@ -3983,19 +3983,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S2048_D8192
3983
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3984
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3985
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3986
- hf_kernels_layer_norm 2.14% 142.004us 28.99% 1.924ms 1.924ms 0.000us 0.00% 6.477ms 6.477ms 1
3987
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.66% 43.639us 26.68% 1.771ms 590.278us 4.886ms 100.00% 6.477ms 2.159ms 3
3988
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.887ms 100.03% 4.887ms 4.887ms 1
3989
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.886ms 100.00% 4.886ms 1.629ms 3
3990
- Activity Buffer Request 25.01% 1.660ms 25.01% 1.660ms 1.660ms 1.591ms 32.57% 1.591ms 1.591ms 1
3991
- aten::view 0.17% 11.341us 0.17% 11.341us 1.890us 0.000us 0.00% 0.000us 0.000us 6
3992
- aten::empty 0.47% 31.442us 0.47% 31.442us 3.494us 0.000us 0.00% 0.000us 0.000us 9
3993
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.07% 4.640us 0.07% 4.640us 1.547us 0.000us 0.00% 0.000us 0.000us 3
3994
- cudaLaunchKernel 0.46% 30.730us 0.46% 30.730us 10.243us 0.000us 0.00% 0.000us 0.000us 3
3995
- cudaDeviceSynchronize 71.01% 4.714ms 71.01% 4.714ms 4.714ms 0.000us 0.00% 0.000us 0.000us 1
3996
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3997
- Self CPU time total: 6.638ms
3998
- Self CUDA time total: 4.886ms
3999
 
4000
 
4001
 
@@ -4005,19 +4005,19 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D4096
4005
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
- hf_kernels_layer_norm 1.93% 128.176us 30.23% 2.007ms 2.007ms 0.000us 0.00% 6.371ms 6.371ms 1
4009
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.67% 44.789us 28.12% 1.867ms 622.462us 4.799ms 100.00% 6.371ms 2.124ms 3
4010
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.800ms 100.03% 4.800ms 4.800ms 1
4011
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.799ms 100.00% 4.799ms 1.600ms 3
4012
- Activity Buffer Request 26.44% 1.756ms 26.44% 1.756ms 1.756ms 1.572ms 32.76% 1.572ms 1.572ms 1
4013
- aten::view 0.18% 11.888us 0.18% 11.888us 1.981us 0.000us 0.00% 0.000us 0.000us 6
4014
- aten::empty 0.47% 31.493us 0.47% 31.493us 3.499us 0.000us 0.00% 0.000us 0.000us 9
4015
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.07% 4.790us 0.07% 4.790us 1.597us 0.000us 0.00% 0.000us 0.000us 3
4016
- cudaLaunchKernel 0.46% 30.490us 0.46% 30.490us 10.163us 0.000us 0.00% 0.000us 0.000us 3
4017
- cudaDeviceSynchronize 69.77% 4.633ms 69.77% 4.633ms 4.633ms 0.000us 0.00% 0.000us 0.000us 1
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
- Self CPU time total: 6.641ms
4020
- Self CUDA time total: 4.799ms
4021
 
4022
 
4023
 
@@ -4027,40 +4027,37 @@ PROFILE TRACE: hf_kernels_layer_norm | LN_B16_S4096_D8192
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4029
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4030
- hf_kernels_layer_norm 1.63% 190.425us 19.77% 2.315ms 2.315ms 0.000us 0.00% 12.766ms 12.766ms 1
4031
- _layer_norm_f8ec252::dropout_add_ln_fwd 0.56% 65.132us 17.99% 2.107ms 702.188us 9.610ms 100.00% 12.766ms 4.255ms 3
4032
- hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.611ms 100.01% 9.611ms 9.611ms 1
4033
- void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.610ms 100.00% 9.610ms 3.203ms 3
4034
- Activity Buffer Request 14.43% 1.690ms 14.43% 1.690ms 1.690ms 3.156ms 32.84% 3.156ms 3.156ms 1
4035
- aten::view 0.16% 18.311us 0.16% 18.311us 3.052us 0.000us 0.00% 0.000us 0.000us 6
4036
- aten::empty 0.27% 31.990us 0.27% 31.990us 3.554us 0.000us 0.00% 0.000us 0.000us 9
4037
- cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.06% 6.981us 0.06% 6.981us 2.327us 0.000us 0.00% 0.000us 0.000us 3
4038
- cudaLaunchKernel 2.67% 312.827us 2.67% 312.827us 104.276us 0.000us 0.00% 0.000us 0.000us 3
4039
- cudaDeviceSynchronize 80.23% 9.393ms 80.23% 9.393ms 9.393ms 0.000us 0.00% 0.000us 0.000us 1
4040
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4041
- Self CPU time total: 11.708ms
4042
- Self CUDA time total: 9.610ms
4043
 
4044
 
4045
  impl wl p50(ms) ok
4046
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
4047
  hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
4048
- hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4049
  hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
4050
  </pre></div>
4051
  <div class="uv-install-logs" id="uv-logs-benchmark">
4052
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4053
  <div class="uv-logs-content" style="display: none;">
4054
- Downloading hf-xet (3.2MiB)
4055
- Downloaded hf-xet
4056
  Installed 14 packages in 12ms
4057
  </div>
4058
  </div>
4059
- <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
4060
-
4061
- Fetching 4 files: 25%|██▌ | 1/4 [00:00&lt;00:00, 8.01it/s]
4062
- Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.00it/s]
4063
- Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.30it/s]</div>
4064
  <div class="cell-artifacts">
4065
  <h4>Artifacts:</h4>
4066
  <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
 
3889
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3890
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3891
  </span> |
3892
+ Cell: benchmark | 6.38s
3893
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3894
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3895
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3961
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3962
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3963
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3964
+ hf_kernels_layer_norm 4.50% 190.523us 50.02% 2.118ms 2.118ms 0.000us 0.00% 3.104ms 3.104ms 1
3965
+ _layer_norm_f8ec252::dropout_add_ln_fwd 1.66% 70.302us 44.96% 1.904ms 634.711us 2.362ms 100.00% 3.104ms 1.035ms 3
3966
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.364ms 100.07% 2.364ms 2.364ms 1
3967
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 2.362ms 100.00% 2.362ms 787.316us 3
3968
+ Activity Buffer Request 40.99% 1.736ms 40.99% 1.736ms 1.736ms 741.567us 31.40% 741.567us 741.567us 1
3969
+ aten::view 0.56% 23.541us 0.56% 23.541us 3.923us 0.000us 0.00% 0.000us 0.000us 6
3970
+ aten::empty 1.07% 45.480us 1.07% 45.480us 5.053us 0.000us 0.00% 0.000us 0.000us 9
3971
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.24% 10.011us 0.24% 10.011us 3.337us 0.000us 0.00% 0.000us 0.000us 3
3972
+ cudaLaunchKernel 1.01% 42.571us 1.01% 42.571us 14.190us 0.000us 0.00% 0.000us 0.000us 3
3973
+ cudaDeviceSynchronize 49.98% 2.117ms 49.98% 2.117ms 2.117ms 0.000us 0.00% 0.000us 0.000us 1
3974
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3975
+ Self CPU time total: 4.235ms
3976
+ Self CUDA time total: 2.362ms
3977
 
3978
 
3979
 
 
3983
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3984
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3985
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3986
+ hf_kernels_layer_norm 2.21% 144.492us 28.97% 1.894ms 1.894ms 0.000us 0.00% 6.395ms 6.395ms 1
3987
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.69% 45.222us 26.58% 1.738ms 579.353us 4.814ms 100.00% 6.395ms 2.132ms 3
3988
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.816ms 100.03% 4.816ms 4.816ms 1
3989
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.814ms 100.00% 4.814ms 1.605ms 3
3990
+ Activity Buffer Request 24.92% 1.629ms 24.92% 1.629ms 1.629ms 1.581ms 32.84% 1.581ms 1.581ms 1
3991
+ aten::view 0.18% 11.541us 0.18% 11.541us 1.923us 0.000us 0.00% 0.000us 0.000us 6
3992
+ aten::empty 0.45% 29.440us 0.45% 29.440us 3.271us 0.000us 0.00% 0.000us 0.000us 9
3993
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.08% 5.060us 0.08% 5.060us 1.687us 0.000us 0.00% 0.000us 0.000us 3
3994
+ cudaLaunchKernel 0.45% 29.150us 0.45% 29.150us 9.717us 0.000us 0.00% 0.000us 0.000us 3
3995
+ cudaDeviceSynchronize 71.03% 4.644ms 71.03% 4.644ms 4.644ms 0.000us 0.00% 0.000us 0.000us 1
3996
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3997
+ Self CPU time total: 6.538ms
3998
+ Self CUDA time total: 4.814ms
3999
 
4000
 
4001
 
 
4005
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4006
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
+ hf_kernels_layer_norm 2.04% 135.241us 30.10% 1.992ms 1.992ms 0.000us 0.00% 6.361ms 6.361ms 1
4009
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.68% 45.331us 27.89% 1.846ms 615.254us 4.793ms 100.00% 6.361ms 2.120ms 3
4010
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.794ms 100.03% 4.794ms 4.794ms 1
4011
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 4.793ms 100.00% 4.793ms 1.598ms 3
4012
+ Activity Buffer Request 26.25% 1.737ms 26.25% 1.737ms 1.737ms 1.569ms 32.73% 1.569ms 1.569ms 1
4013
+ aten::view 0.17% 11.061us 0.17% 11.061us 1.844us 0.000us 0.00% 0.000us 0.000us 6
4014
+ aten::empty 0.44% 29.151us 0.44% 29.151us 3.239us 0.000us 0.00% 0.000us 0.000us 9
4015
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.09% 5.831us 0.09% 5.831us 1.944us 0.000us 0.00% 0.000us 0.000us 3
4016
+ cudaLaunchKernel 0.43% 28.320us 0.43% 28.320us 9.440us 0.000us 0.00% 0.000us 0.000us 3
4017
+ cudaDeviceSynchronize 69.90% 4.626ms 69.90% 4.626ms 4.626ms 0.000us 0.00% 0.000us 0.000us 1
4018
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4019
+ Self CPU time total: 6.618ms
4020
+ Self CUDA time total: 4.793ms
4021
 
4022
 
4023
 
 
4027
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4028
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4029
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4030
+ hf_kernels_layer_norm 1.16% 134.713us 18.89% 2.202ms 2.202ms 0.000us 0.00% 12.808ms 12.808ms 1
4031
+ _layer_norm_f8ec252::dropout_add_ln_fwd 0.38% 44.369us 17.64% 2.056ms 685.371us 9.627ms 100.00% 12.808ms 4.269ms 3
4032
+ hf_kernels_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.628ms 100.02% 9.628ms 9.628ms 1
4033
+ void layer_norm::ln_fwd_kernel&lt;layer_norm::Kernel_tr... 0.00% 0.000us 0.00% 0.000us 0.000us 9.627ms 100.00% 9.627ms 3.209ms 3
4034
+ Activity Buffer Request 14.91% 1.739ms 14.91% 1.739ms 1.739ms 3.182ms 33.05% 3.182ms 3.182ms 1
4035
+ aten::view 0.10% 11.381us 0.10% 11.381us 1.897us 0.000us 0.00% 0.000us 0.000us 6
4036
+ aten::empty 0.26% 29.940us 0.26% 29.940us 3.327us 0.000us 0.00% 0.000us 0.000us 9
4037
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFla... 0.04% 4.960us 0.04% 4.960us 1.653us 0.000us 0.00% 0.000us 0.000us 3
4038
+ cudaLaunchKernel 2.04% 237.996us 2.04% 237.996us 79.332us 0.000us 0.00% 0.000us 0.000us 3
4039
+ cudaDeviceSynchronize 81.11% 9.457ms 81.11% 9.457ms 9.457ms 0.000us 0.00% 0.000us 0.000us 1
4040
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4041
+ Self CPU time total: 11.659ms
4042
+ Self CUDA time total: 9.627ms
4043
 
4044
 
4045
  impl wl p50(ms) ok
4046
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
4047
  hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
4048
+ hf_kernels_layer_norm LN_B16_S4096_D4096 1.64 True
4049
  hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
4050
  </pre></div>
4051
  <div class="uv-install-logs" id="uv-logs-benchmark">
4052
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4053
  <div class="uv-logs-content" style="display: none;">
 
 
4054
  Installed 14 packages in 12ms
4055
  </div>
4056
  </div>
4057
+ <div class="cell-stderr">Fetching 4 files: 0%| | 0/4 [00:00&lt;?, ?it/s]
4058
+ Fetching 4 files: 25%|██▌ | 1/4 [00:00&lt;00:00, 9.25it/s]
4059
+ Fetching 4 files: 50%|█████ | 2/4 [00:01&lt;00:01, 1.14it/s]
4060
+ Fetching 4 files: 100%|██████████| 4/4 [00:01&lt;00:00, 2.62it/s]</div>
 
4061
  <div class="cell-artifacts">
4062
  <h4>Artifacts:</h4>
4063
  <a href="artifacts/benchmark/layer_norm.jsonl" class="artifact" target="_blank">layer_norm.jsonl</a>
layer_norm/impls/torch_layer_norm.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.30s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3904,7 +3904,7 @@ Cell: nv | 0.30s
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
- <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 19:40:36 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
@@ -3913,7 +3913,7 @@ Cell: nv | 0.30s
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
- | N/A 26C P8 24W / 350W | 0MiB / 46068MiB | 0% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
@@ -3937,7 +3937,7 @@ Cell: nv | 0.30s
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3939
  </span> |
3940
- Cell: benchmark | 32.13s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -3985,19 +3985,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D4096
3985
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3986
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
- torch_layer_norm 2.46% 151.464us 66.01% 4.061ms 4.061ms 0.000us 0.00% 3.020ms 3.020ms 1
3989
- aten::layer_norm 0.24% 14.681us 63.55% 3.910ms 1.303ms 0.000us 0.00% 3.020ms 1.007ms 3
3990
- aten::native_layer_norm 20.97% 1.290ms 63.31% 3.895ms 1.298ms 2.310ms 100.00% 3.020ms 1.007ms 3
3991
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.311ms 100.06% 2.311ms 2.311ms 1
3992
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.310ms 100.00% 2.310ms 770.057us 3
3993
- Activity Buffer Request 40.34% 2.482ms 40.34% 2.482ms 2.482ms 709.854us 30.73% 709.854us 709.854us 1
3994
- aten::empty 1.09% 66.873us 1.09% 66.873us 7.430us 0.000us 0.00% 0.000us 0.000us 9
3995
- cudaLaunchKernel 0.79% 48.731us 0.79% 48.731us 16.244us 0.000us 0.00% 0.000us 0.000us 3
3996
- aten::view 0.12% 7.460us 0.12% 7.460us 1.243us 0.000us 0.00% 0.000us 0.000us 6
3997
- cudaDeviceSynchronize 33.99% 2.091ms 33.99% 2.091ms 2.091ms 0.000us 0.00% 0.000us 0.000us 1
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
- Self CPU time total: 6.152ms
4000
- Self CUDA time total: 2.310ms
4001
 
4002
 
4003
 
@@ -4007,19 +4007,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S2048_D8192
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4009
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4010
- torch_layer_norm 1.07% 70.812us 28.19% 1.857ms 1.857ms 0.000us 0.00% 6.442ms 6.442ms 1
4011
- aten::layer_norm 0.14% 9.000us 27.11% 1.786ms 595.403us 0.000us 0.00% 6.442ms 2.147ms 3
4012
- aten::native_layer_norm 0.75% 49.502us 26.98% 1.777ms 592.403us 4.862ms 100.00% 6.442ms 2.147ms 3
4013
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.864ms 100.03% 4.864ms 4.864ms 1
4014
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.862ms 100.00% 4.862ms 1.621ms 3
4015
- Activity Buffer Request 25.31% 1.667ms 25.31% 1.667ms 1.667ms 1.580ms 32.49% 1.580ms 1.580ms 1
4016
- aten::empty 0.43% 28.150us 0.43% 28.150us 3.128us 0.000us 0.00% 0.000us 0.000us 9
4017
- cudaLaunchKernel 0.44% 28.800us 0.44% 28.800us 9.600us 0.000us 0.00% 0.000us 0.000us 3
4018
- aten::view 0.06% 3.751us 0.06% 3.751us 0.625us 0.000us 0.00% 0.000us 0.000us 6
4019
- cudaDeviceSynchronize 71.81% 4.731ms 71.81% 4.731ms 4.731ms 0.000us 0.00% 0.000us 0.000us 1
4020
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4021
- Self CPU time total: 6.588ms
4022
- Self CUDA time total: 4.862ms
4023
 
4024
 
4025
 
@@ -4029,19 +4029,19 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D4096
4029
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4030
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4031
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4032
- torch_layer_norm 1.08% 70.451us 29.89% 1.957ms 1.957ms 0.000us 0.00% 6.239ms 6.239ms 1
4033
- aten::layer_norm 0.13% 8.611us 28.81% 1.886ms 628.738us 0.000us 0.00% 6.239ms 2.080ms 3
4034
- aten::native_layer_norm 0.76% 49.870us 28.68% 1.878ms 625.867us 4.724ms 100.00% 6.239ms 2.080ms 3
4035
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.726ms 100.03% 4.726ms 4.726ms 1
4036
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.724ms 100.00% 4.724ms 1.575ms 3
4037
- Activity Buffer Request 26.98% 1.766ms 26.98% 1.766ms 1.766ms 1.515ms 32.08% 1.515ms 1.515ms 1
4038
- aten::empty 0.45% 29.490us 0.45% 29.490us 3.277us 0.000us 0.00% 0.000us 0.000us 9
4039
- cudaLaunchKernel 0.43% 27.941us 0.43% 27.941us 9.314us 0.000us 0.00% 0.000us 0.000us 3
4040
- aten::view 0.06% 4.101us 0.06% 4.101us 0.684us 0.000us 0.00% 0.000us 0.000us 6
4041
- cudaDeviceSynchronize 70.11% 4.590ms 70.11% 4.590ms 4.590ms 0.000us 0.00% 0.000us 0.000us 1
4042
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4043
- Self CPU time total: 6.547ms
4044
- Self CUDA time total: 4.724ms
4045
 
4046
 
4047
 
@@ -4051,23 +4051,23 @@ PROFILE TRACE: torch_layer_norm | LN_B16_S4096_D8192
4051
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4052
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4053
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
- torch_layer_norm 0.65% 74.391us 15.11% 1.731ms 1.731ms 0.000us 0.00% 13.123ms 13.123ms 1
4055
- aten::layer_norm 0.08% 9.310us 14.46% 1.656ms 552.093us 0.000us 0.00% 13.123ms 4.374ms 3
4056
- aten::native_layer_norm 0.45% 52.052us 14.38% 1.647ms 548.989us 9.864ms 100.00% 13.123ms 4.374ms 3
4057
- torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.866ms 100.01% 9.866ms 9.866ms 1
4058
- void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.864ms 100.00% 9.864ms 3.288ms 3
4059
- Activity Buffer Request 11.61% 1.330ms 11.61% 1.330ms 1.330ms 3.258ms 33.03% 3.258ms 3.258ms 1
4060
- aten::empty 0.27% 31.120us 0.27% 31.120us 3.458us 0.000us 0.00% 0.000us 0.000us 9
4061
- cudaLaunchKernel 2.01% 229.635us 2.01% 229.635us 76.545us 0.000us 0.00% 0.000us 0.000us 3
4062
- aten::view 0.04% 4.651us 0.04% 4.651us 0.775us 0.000us 0.00% 0.000us 0.000us 6
4063
- cudaDeviceSynchronize 84.89% 9.721ms 84.89% 9.721ms 9.721ms 0.000us 0.00% 0.000us 0.000us 1
4064
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4065
- Self CPU time total: 11.451ms
4066
- Self CUDA time total: 9.864ms
4067
 
4068
 
4069
  impl wl p50(ms) ok
4070
- torch_layer_norm LN_B16_S2048_D4096 0.81 True
4071
  torch_layer_norm LN_B16_S2048_D8192 1.68 True
4072
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
4073
  torch_layer_norm LN_B16_S4096_D8192 3.32 True
@@ -4075,53 +4075,7 @@ torch_layer_norm LN_B16_S4096_D8192 3.32 True
4075
  <div class="uv-install-logs" id="uv-logs-benchmark">
4076
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4077
  <div class="uv-logs-content" style="display: none;">
4078
- Building kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4079
- Downloading sympy (6.0MiB)
4080
- Downloading nvidia-cusparse-cu12 (274.9MiB)
4081
- Downloading nvidia-cuda-cupti-cu12 (9.8MiB)
4082
- Downloading nvidia-cublas-cu12 (566.8MiB)
4083
- Downloading nvidia-curand-cu12 (60.7MiB)
4084
- Downloading nvidia-nccl-cu12 (307.4MiB)
4085
- Downloading nvidia-cufft-cu12 (184.2MiB)
4086
- Downloading nvidia-cufile-cu12 (1.1MiB)
4087
- Downloading nvidia-cusolver-cu12 (255.1MiB)
4088
- Downloading numpy (16.1MiB)
4089
- Downloading setuptools (1.1MiB)
4090
- Downloading nvidia-cuda-nvrtc-cu12 (84.0MiB)
4091
- Downloading kiwisolver (1.4MiB)
4092
- Downloading nvidia-cusparselt-cu12 (273.9MiB)
4093
- Downloading matplotlib (8.3MiB)
4094
- Downloading nvidia-nvjitlink-cu12 (37.4MiB)
4095
- Downloading nvidia-cudnn-cu12 (674.0MiB)
4096
- Downloading fonttools (4.8MiB)
4097
- Downloading pillow (6.7MiB)
4098
- Downloading networkx (2.0MiB)
4099
- Downloading torch (846.9MiB)
4100
- Downloading triton (148.3MiB)
4101
- Downloaded nvidia-cufile-cu12
4102
- Downloaded kiwisolver
4103
- Downloaded setuptools
4104
- Downloaded networkx
4105
- Downloaded fonttools
4106
- Downloaded pillow
4107
- Built kernels-benchmark-tools @ file:///__w/kernels-benchmarks/kernels-benchmarks/tools
4108
- Downloaded nvidia-cuda-cupti-cu12
4109
- Downloaded matplotlib
4110
- Downloaded numpy
4111
- Downloaded sympy
4112
- Downloaded nvidia-nvjitlink-cu12
4113
- Downloaded nvidia-curand-cu12
4114
- Downloaded nvidia-cuda-nvrtc-cu12
4115
- Downloaded triton
4116
- Downloaded nvidia-cufft-cu12
4117
- Downloaded nvidia-cusolver-cu12
4118
- Downloaded nvidia-cusparselt-cu12
4119
- Downloaded nvidia-cusparse-cu12
4120
- Downloaded nvidia-nccl-cu12
4121
- Downloaded nvidia-cublas-cu12
4122
- Downloaded nvidia-cudnn-cu12
4123
- Downloaded torch
4124
- Installed 37 packages in 284ms
4125
  </div>
4126
  </div>
4127
  <div class="cell-artifacts">
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.25s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 22:48:33 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
 
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
+ | N/A 30C P0 107W / 350W | 0MiB / 46068MiB | 68% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
 
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3939
  </span> |
3940
+ Cell: benchmark | 7.61s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
3985
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3986
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
3987
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3988
+ torch_layer_norm 3.61% 151.022us 49.72% 2.081ms 2.081ms 0.000us 0.00% 3.037ms 3.037ms 1
3989
+ aten::layer_norm 0.35% 14.701us 46.11% 1.930ms 643.468us 0.000us 0.00% 3.037ms 1.012ms 3
3990
+ aten::native_layer_norm 1.79% 75.131us 45.76% 1.916ms 638.567us 2.326ms 100.00% 3.037ms 1.012ms 3
3991
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 2.327ms 100.06% 2.327ms 2.327ms 1
3992
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 2.326ms 100.00% 2.326ms 775.187us 3
3993
+ Activity Buffer Request 41.50% 1.738ms 41.50% 1.738ms 1.738ms 711.774us 30.61% 711.774us 711.774us 1
3994
+ aten::empty 1.17% 48.860us 1.17% 48.860us 5.429us 0.000us 0.00% 0.000us 0.000us 9
3995
+ cudaLaunchKernel 1.12% 46.753us 1.12% 46.753us 15.584us 0.000us 0.00% 0.000us 0.000us 3
3996
+ aten::view 0.18% 7.441us 0.18% 7.441us 1.240us 0.000us 0.00% 0.000us 0.000us 6
3997
+ cudaDeviceSynchronize 50.28% 2.105ms 50.28% 2.105ms 2.105ms 0.000us 0.00% 0.000us 0.000us 1
3998
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
3999
+ Self CPU time total: 4.186ms
4000
+ Self CUDA time total: 2.326ms
4001
 
4002
 
4003
 
 
4007
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4008
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4009
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4010
+ torch_layer_norm 1.05% 69.561us 28.39% 1.886ms 1.886ms 0.000us 0.00% 6.477ms 6.477ms 1
4011
+ aten::layer_norm 0.13% 8.670us 27.34% 1.816ms 605.463us 0.000us 0.00% 6.477ms 2.159ms 3
4012
+ aten::native_layer_norm 0.77% 50.957us 27.21% 1.808ms 602.573us 4.891ms 100.00% 6.477ms 2.159ms 3
4013
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.893ms 100.03% 4.893ms 4.893ms 1
4014
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.891ms 100.00% 4.891ms 1.630ms 3
4015
+ Activity Buffer Request 25.53% 1.696ms 25.53% 1.696ms 1.696ms 1.586ms 32.42% 1.586ms 1.586ms 1
4016
+ aten::empty 0.45% 29.753us 0.45% 29.753us 3.306us 0.000us 0.00% 0.000us 0.000us 9
4017
+ cudaLaunchKernel 0.41% 27.542us 0.41% 27.542us 9.181us 0.000us 0.00% 0.000us 0.000us 3
4018
+ aten::view 0.05% 3.522us 0.05% 3.522us 0.587us 0.000us 0.00% 0.000us 0.000us 6
4019
+ cudaDeviceSynchronize 71.61% 4.758ms 71.61% 4.758ms 4.758ms 0.000us 0.00% 0.000us 0.000us 1
4020
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4021
+ Self CPU time total: 6.643ms
4022
+ Self CUDA time total: 4.891ms
4023
 
4024
 
4025
 
 
4029
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4030
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4031
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4032
+ torch_layer_norm 1.06% 68.562us 29.18% 1.889ms 1.889ms 0.000us 0.00% 6.234ms 6.234ms 1
4033
+ aten::layer_norm 0.14% 9.330us 28.12% 1.821ms 606.966us 0.000us 0.00% 6.234ms 2.078ms 3
4034
+ aten::native_layer_norm 0.78% 50.590us 27.97% 1.812ms 603.856us 4.719ms 100.00% 6.234ms 2.078ms 3
4035
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 4.721ms 100.03% 4.721ms 4.721ms 1
4036
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 4.719ms 100.00% 4.719ms 1.573ms 3
4037
+ Activity Buffer Request 26.26% 1.700ms 26.26% 1.700ms 1.700ms 1.515ms 32.11% 1.515ms 1.515ms 1
4038
+ aten::empty 0.44% 28.660us 0.44% 28.660us 3.184us 0.000us 0.00% 0.000us 0.000us 9
4039
+ cudaLaunchKernel 0.43% 28.042us 0.43% 28.042us 9.347us 0.000us 0.00% 0.000us 0.000us 3
4040
+ aten::view 0.06% 3.840us 0.06% 3.840us 0.640us 0.000us 0.00% 0.000us 0.000us 6
4041
+ cudaDeviceSynchronize 70.82% 4.586ms 70.82% 4.586ms 4.586ms 0.000us 0.00% 0.000us 0.000us 1
4042
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4043
+ Self CPU time total: 6.476ms
4044
+ Self CUDA time total: 4.719ms
4045
 
4046
 
4047
 
 
4051
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4052
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4053
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4054
+ torch_layer_norm 0.64% 72.823us 14.96% 1.710ms 1.710ms 0.000us 0.00% 13.144ms 13.144ms 1
4055
+ aten::layer_norm 0.08% 8.940us 14.32% 1.637ms 545.678us 0.000us 0.00% 13.144ms 4.381ms 3
4056
+ aten::native_layer_norm 0.49% 56.431us 14.24% 1.628ms 542.698us 9.871ms 100.00% 13.144ms 4.381ms 3
4057
+ torch_layer_norm 0.00% 0.000us 0.00% 0.000us 0.000us 9.872ms 100.02% 9.872ms 9.872ms 1
4058
+ void at::native::(anonymous namespace)::vectorized_l... 0.00% 0.000us 0.00% 0.000us 0.000us 9.871ms 100.00% 9.871ms 3.290ms 3
4059
+ Activity Buffer Request 11.76% 1.344ms 11.76% 1.344ms 1.344ms 3.273ms 33.16% 3.273ms 3.273ms 1
4060
+ aten::empty 0.26% 29.920us 0.26% 29.920us 3.324us 0.000us 0.00% 0.000us 0.000us 9
4061
+ cudaLaunchKernel 1.69% 193.294us 1.69% 193.294us 64.431us 0.000us 0.00% 0.000us 0.000us 3
4062
+ aten::view 0.04% 4.390us 0.04% 4.390us 0.732us 0.000us 0.00% 0.000us 0.000us 6
4063
+ cudaDeviceSynchronize 85.04% 9.722ms 85.04% 9.722ms 9.722ms 0.000us 0.00% 0.000us 0.000us 1
4064
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4065
+ Self CPU time total: 11.432ms
4066
+ Self CUDA time total: 9.871ms
4067
 
4068
 
4069
  impl wl p50(ms) ok
4070
+ torch_layer_norm LN_B16_S2048_D4096 0.82 True
4071
  torch_layer_norm LN_B16_S2048_D8192 1.68 True
4072
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
4073
  torch_layer_norm LN_B16_S4096_D8192 3.32 True
 
4075
  <div class="uv-install-logs" id="uv-logs-benchmark">
4076
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4077
  <div class="uv-logs-content" style="display: none;">
4078
+ Installed 37 packages in 201ms
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4079
  </div>
4080
  </div>
4081
  <div class="cell-artifacts">
layer_norm/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: fa76da3cc0e8c6ec848648e3fa2d66315df4d6c7779fd0c7e2825d697af78f88
  • Pointer size: 130 Bytes
  • Size of remote file: 14.6 kB

Git LFS Details

  • SHA256: a50c3bf38dd2b9e606d91842d6e16e6ad0750d6688e7a674eada63b34b0c39ce
  • Pointer size: 130 Bytes
  • Size of remote file: 14.6 kB
layer_norm/results/combined_results.html CHANGED
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
- <dc:date>2025-12-19T19:55:25.441156</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
@@ -3973,70 +3973,70 @@ body[data-tool="eraser"] .main-content {
3973
  <g id="matplotlib.axis_2">
3974
  <g id="ytick_1">
3975
  <g id="grid-y--2" class="grid grid-y">
3976
- <path d="M 47.72 408.405291 L 840.20233 408.405291 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3977
  </g>
3978
  <g id="line2d_5">
3979
  <defs>
3980
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3981
  </defs>
3982
  <g>
3983
- <use ns4:href="#m0fca2865ba" x="47.72" y="408.405291" style="stroke: #000000; stroke-width: 0.8" />
3984
  </g>
3985
  </g>
3986
  <g id="text_5">
3987
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.204509" transform="rotate(-0 40.72 412.204509)">1.0</text>
3988
  </g>
3989
  </g>
3990
  <g id="ytick_2">
3991
  <g id="grid-y--3" class="grid grid-y">
3992
- <path d="M 47.72 330.385445 L 840.20233 330.385445 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3993
  </g>
3994
  <g id="line2d_6">
3995
  <g>
3996
- <use ns4:href="#m0fca2865ba" x="47.72" y="330.385445" style="stroke: #000000; stroke-width: 0.8" />
3997
  </g>
3998
  </g>
3999
  <g id="text_6">
4000
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.184664" transform="rotate(-0 40.72 334.184664)">1.5</text>
4001
  </g>
4002
  </g>
4003
  <g id="ytick_3">
4004
  <g id="grid-y--4" class="grid grid-y">
4005
- <path d="M 47.72 252.3656 L 840.20233 252.3656 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4006
  </g>
4007
  <g id="line2d_7">
4008
  <g>
4009
- <use ns4:href="#m0fca2865ba" x="47.72" y="252.3656" style="stroke: #000000; stroke-width: 0.8" />
4010
  </g>
4011
  </g>
4012
  <g id="text_7">
4013
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.164819" transform="rotate(-0 40.72 256.164819)">2.0</text>
4014
  </g>
4015
  </g>
4016
  <g id="ytick_4">
4017
  <g id="grid-y--5" class="grid grid-y">
4018
- <path d="M 47.72 174.345754 L 840.20233 174.345754 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4019
  </g>
4020
  <g id="line2d_8">
4021
  <g>
4022
- <use ns4:href="#m0fca2865ba" x="47.72" y="174.345754" style="stroke: #000000; stroke-width: 0.8" />
4023
  </g>
4024
  </g>
4025
  <g id="text_8">
4026
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="178.144973" transform="rotate(-0 40.72 178.144973)">2.5</text>
4027
  </g>
4028
  </g>
4029
  <g id="ytick_5">
4030
  <g id="grid-y--6" class="grid grid-y">
4031
- <path d="M 47.72 96.325909 L 840.20233 96.325909 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4032
  </g>
4033
  <g id="line2d_9">
4034
  <g>
4035
- <use ns4:href="#m0fca2865ba" x="47.72" y="96.325909" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="100.125128" transform="rotate(-0 40.72 100.125128)">3.0</text>
4040
  </g>
4041
  </g>
4042
  <g id="label--y" class="ylabel">
@@ -4044,27 +4044,27 @@ body[data-tool="eraser"] .main-content {
4044
  </g>
4045
  </g>
4046
  <g id="series--torch-layer-norm" class="series">
4047
- <path d="M 83.741924 437.689571 L 323.888085 302.503181 L 564.034245 313.112476 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4048
  <defs>
4049
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4050
  </defs>
4051
  <g clip-path="url(#p2214f54723)">
4052
  <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4053
- <use ns4:href="#md7efaf3aec" x="323.888085" y="302.503181" style="fill: #1f77b4; stroke: #1f77b4" />
4054
- <use ns4:href="#md7efaf3aec" x="564.034245" y="313.112476" style="fill: #1f77b4; stroke: #1f77b4" />
4055
  <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4056
  </g>
4057
  </g>
4058
  <g id="series--hf-kernels-layer-norm" class="series">
4059
- <path d="M 83.741924 434.265436 L 323.888085 306.894918 L 564.034245 307.077328 L 804.180406 56.113857 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4060
  <defs>
4061
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4062
  </defs>
4063
  <g clip-path="url(#p2214f54723)">
4064
- <use ns4:href="#m9b8c54d372" x="83.741924" y="434.265436" style="fill: #ff7f0e; stroke: #ff7f0e" />
4065
- <use ns4:href="#m9b8c54d372" x="323.888085" y="306.894918" style="fill: #ff7f0e; stroke: #ff7f0e" />
4066
- <use ns4:href="#m9b8c54d372" x="564.034245" y="307.077328" style="fill: #ff7f0e; stroke: #ff7f0e" />
4067
- <use ns4:href="#m9b8c54d372" x="804.180406" y="56.113857" style="fill: #ff7f0e; stroke: #ff7f0e" />
4068
  </g>
4069
  </g>
4070
  <g id="patch_3">
@@ -4122,7 +4122,7 @@ body[data-tool="eraser"] .main-content {
4122
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4123
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4124
  </span> |
4125
- Cell: combine | 4.63s
4126
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4127
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4128
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4211,9 +4211,9 @@ COMBINED BENCHMARK SUMMARY
4211
  impl wl p50(ms) ok
4212
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
4213
  hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
4214
- hf_kernels_layer_norm LN_B16_S4096_D4096 1.65 True
4215
  hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
4216
- torch_layer_norm LN_B16_S2048_D4096 0.81 True
4217
  torch_layer_norm LN_B16_S2048_D8192 1.68 True
4218
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
4219
  torch_layer_norm LN_B16_S4096_D8192 3.32 True
@@ -4236,7 +4236,7 @@ Implementations included:
4236
  <div class="uv-install-logs" id="uv-logs-combine">
4237
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4238
  <div class="uv-logs-content" style="display: none;">
4239
- Installed 37 packages in 299ms
4240
  </div>
4241
  </div>
4242
  <div class="cell-artifacts">
@@ -4249,7 +4249,7 @@ Installed 37 packages in 299ms
4249
  <rdf:RDF>
4250
  <ns2:Work>
4251
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4252
- <dc:date>2025-12-19T19:55:25.441156</dc:date>
4253
  <dc:format>image/svg+xml</dc:format>
4254
  <dc:creator>
4255
  <ns2:Agent>
@@ -4333,70 +4333,70 @@ Installed 37 packages in 299ms
4333
  <g id="matplotlib.axis_2">
4334
  <g id="ytick_1">
4335
  <g id="grid-y--2" class="grid grid-y">
4336
- <path d="M 47.72 408.405291 L 840.20233 408.405291 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4337
  </g>
4338
  <g id="line2d_5">
4339
  <defs>
4340
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4341
  </defs>
4342
  <g>
4343
- <use ns4:href="#m0fca2865ba" x="47.72" y="408.405291" style="stroke: #000000; stroke-width: 0.8" />
4344
  </g>
4345
  </g>
4346
  <g id="text_5">
4347
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="412.204509" transform="rotate(-0 40.72 412.204509)">1.0</text>
4348
  </g>
4349
  </g>
4350
  <g id="ytick_2">
4351
  <g id="grid-y--3" class="grid grid-y">
4352
- <path d="M 47.72 330.385445 L 840.20233 330.385445 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4353
  </g>
4354
  <g id="line2d_6">
4355
  <g>
4356
- <use ns4:href="#m0fca2865ba" x="47.72" y="330.385445" style="stroke: #000000; stroke-width: 0.8" />
4357
  </g>
4358
  </g>
4359
  <g id="text_6">
4360
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.184664" transform="rotate(-0 40.72 334.184664)">1.5</text>
4361
  </g>
4362
  </g>
4363
  <g id="ytick_3">
4364
  <g id="grid-y--4" class="grid grid-y">
4365
- <path d="M 47.72 252.3656 L 840.20233 252.3656 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4366
  </g>
4367
  <g id="line2d_7">
4368
  <g>
4369
- <use ns4:href="#m0fca2865ba" x="47.72" y="252.3656" style="stroke: #000000; stroke-width: 0.8" />
4370
  </g>
4371
  </g>
4372
  <g id="text_7">
4373
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.164819" transform="rotate(-0 40.72 256.164819)">2.0</text>
4374
  </g>
4375
  </g>
4376
  <g id="ytick_4">
4377
  <g id="grid-y--5" class="grid grid-y">
4378
- <path d="M 47.72 174.345754 L 840.20233 174.345754 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4379
  </g>
4380
  <g id="line2d_8">
4381
  <g>
4382
- <use ns4:href="#m0fca2865ba" x="47.72" y="174.345754" style="stroke: #000000; stroke-width: 0.8" />
4383
  </g>
4384
  </g>
4385
  <g id="text_8">
4386
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="178.144973" transform="rotate(-0 40.72 178.144973)">2.5</text>
4387
  </g>
4388
  </g>
4389
  <g id="ytick_5">
4390
  <g id="grid-y--6" class="grid grid-y">
4391
- <path d="M 47.72 96.325909 L 840.20233 96.325909 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4392
  </g>
4393
  <g id="line2d_9">
4394
  <g>
4395
- <use ns4:href="#m0fca2865ba" x="47.72" y="96.325909" style="stroke: #000000; stroke-width: 0.8" />
4396
  </g>
4397
  </g>
4398
  <g id="text_9">
4399
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="100.125128" transform="rotate(-0 40.72 100.125128)">3.0</text>
4400
  </g>
4401
  </g>
4402
  <g id="label--y" class="ylabel">
@@ -4404,27 +4404,27 @@ Installed 37 packages in 299ms
4404
  </g>
4405
  </g>
4406
  <g id="series--torch-layer-norm" class="series">
4407
- <path d="M 83.741924 437.689571 L 323.888085 302.503181 L 564.034245 313.112476 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4408
  <defs>
4409
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4410
  </defs>
4411
  <g clip-path="url(#p2214f54723)">
4412
  <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4413
- <use ns4:href="#md7efaf3aec" x="323.888085" y="302.503181" style="fill: #1f77b4; stroke: #1f77b4" />
4414
- <use ns4:href="#md7efaf3aec" x="564.034245" y="313.112476" style="fill: #1f77b4; stroke: #1f77b4" />
4415
  <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4416
  </g>
4417
  </g>
4418
  <g id="series--hf-kernels-layer-norm" class="series">
4419
- <path d="M 83.741924 434.265436 L 323.888085 306.894918 L 564.034245 307.077328 L 804.180406 56.113857 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4420
  <defs>
4421
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4422
  </defs>
4423
  <g clip-path="url(#p2214f54723)">
4424
- <use ns4:href="#m9b8c54d372" x="83.741924" y="434.265436" style="fill: #ff7f0e; stroke: #ff7f0e" />
4425
- <use ns4:href="#m9b8c54d372" x="323.888085" y="306.894918" style="fill: #ff7f0e; stroke: #ff7f0e" />
4426
- <use ns4:href="#m9b8c54d372" x="564.034245" y="307.077328" style="fill: #ff7f0e; stroke: #ff7f0e" />
4427
- <use ns4:href="#m9b8c54d372" x="804.180406" y="56.113857" style="fill: #ff7f0e; stroke: #ff7f0e" />
4428
  </g>
4429
  </g>
4430
  <g id="patch_3">
 
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
+ <dc:date>2025-12-19T23:02:49.888978</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
 
3973
  <g id="matplotlib.axis_2">
3974
  <g id="ytick_1">
3975
  <g id="grid-y--2" class="grid grid-y">
3976
+ <path d="M 47.72 409.375905 L 840.20233 409.375905 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3977
  </g>
3978
  <g id="line2d_5">
3979
  <defs>
3980
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
3981
  </defs>
3982
  <g>
3983
+ <use ns4:href="#m0fca2865ba" x="47.72" y="409.375905" style="stroke: #000000; stroke-width: 0.8" />
3984
  </g>
3985
  </g>
3986
  <g id="text_5">
3987
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="413.175123" transform="rotate(-0 40.72 413.175123)">1.0</text>
3988
  </g>
3989
  </g>
3990
  <g id="ytick_2">
3991
  <g id="grid-y--3" class="grid grid-y">
3992
+ <path d="M 47.72 331.097781 L 840.20233 331.097781 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3993
  </g>
3994
  <g id="line2d_6">
3995
  <g>
3996
+ <use ns4:href="#m0fca2865ba" x="47.72" y="331.097781" style="stroke: #000000; stroke-width: 0.8" />
3997
  </g>
3998
  </g>
3999
  <g id="text_6">
4000
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.897" transform="rotate(-0 40.72 334.897)">1.5</text>
4001
  </g>
4002
  </g>
4003
  <g id="ytick_3">
4004
  <g id="grid-y--4" class="grid grid-y">
4005
+ <path d="M 47.72 252.819658 L 840.20233 252.819658 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4006
  </g>
4007
  <g id="line2d_7">
4008
  <g>
4009
+ <use ns4:href="#m0fca2865ba" x="47.72" y="252.819658" style="stroke: #000000; stroke-width: 0.8" />
4010
  </g>
4011
  </g>
4012
  <g id="text_7">
4013
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.618877" transform="rotate(-0 40.72 256.618877)">2.0</text>
4014
  </g>
4015
  </g>
4016
  <g id="ytick_4">
4017
  <g id="grid-y--5" class="grid grid-y">
4018
+ <path d="M 47.72 174.541535 L 840.20233 174.541535 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4019
  </g>
4020
  <g id="line2d_8">
4021
  <g>
4022
+ <use ns4:href="#m0fca2865ba" x="47.72" y="174.541535" style="stroke: #000000; stroke-width: 0.8" />
4023
  </g>
4024
  </g>
4025
  <g id="text_8">
4026
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="178.340753" transform="rotate(-0 40.72 178.340753)">2.5</text>
4027
  </g>
4028
  </g>
4029
  <g id="ytick_5">
4030
  <g id="grid-y--6" class="grid grid-y">
4031
+ <path d="M 47.72 96.263411 L 840.20233 96.263411 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4032
  </g>
4033
  <g id="line2d_9">
4034
  <g>
4035
+ <use ns4:href="#m0fca2865ba" x="47.72" y="96.263411" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="100.06263" transform="rotate(-0 40.72 100.06263)">3.0</text>
4040
  </g>
4041
  </g>
4042
  <g id="label--y" class="ylabel">
 
4044
  </g>
4045
  </g>
4046
  <g id="series--torch-layer-norm" class="series">
4047
+ <path d="M 83.741924 437.689571 L 323.888085 303.198519 L 564.034245 314.331547 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4048
  <defs>
4049
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4050
  </defs>
4051
  <g clip-path="url(#p2214f54723)">
4052
  <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4053
+ <use ns4:href="#md7efaf3aec" x="323.888085" y="303.198519" style="fill: #1f77b4; stroke: #1f77b4" />
4054
+ <use ns4:href="#md7efaf3aec" x="564.034245" y="314.331547" style="fill: #1f77b4; stroke: #1f77b4" />
4055
  <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4056
  </g>
4057
  </g>
4058
  <g id="series--hf-kernels-layer-norm" class="series">
4059
+ <path d="M 83.741924 435.6307 L 323.888085 308.184835 L 564.034245 308.438456 L 804.180406 56.036284 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4060
  <defs>
4061
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4062
  </defs>
4063
  <g clip-path="url(#p2214f54723)">
4064
+ <use ns4:href="#m9b8c54d372" x="83.741924" y="435.6307" style="fill: #ff7f0e; stroke: #ff7f0e" />
4065
+ <use ns4:href="#m9b8c54d372" x="323.888085" y="308.184835" style="fill: #ff7f0e; stroke: #ff7f0e" />
4066
+ <use ns4:href="#m9b8c54d372" x="564.034245" y="308.438456" style="fill: #ff7f0e; stroke: #ff7f0e" />
4067
+ <use ns4:href="#m9b8c54d372" x="804.180406" y="56.036284" style="fill: #ff7f0e; stroke: #ff7f0e" />
4068
  </g>
4069
  </g>
4070
  <g id="patch_3">
 
4122
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4123
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4124
  </span> |
4125
+ Cell: combine | 4.43s
4126
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4127
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4128
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4211
  impl wl p50(ms) ok
4212
  hf_kernels_layer_norm LN_B16_S2048_D4096 0.83 True
4213
  hf_kernels_layer_norm LN_B16_S2048_D8192 1.65 True
4214
+ hf_kernels_layer_norm LN_B16_S4096_D4096 1.64 True
4215
  hf_kernels_layer_norm LN_B16_S4096_D8192 3.26 True
4216
+ torch_layer_norm LN_B16_S2048_D4096 0.82 True
4217
  torch_layer_norm LN_B16_S2048_D8192 1.68 True
4218
  torch_layer_norm LN_B16_S4096_D4096 1.61 True
4219
  torch_layer_norm LN_B16_S4096_D8192 3.32 True
 
4236
  <div class="uv-install-logs" id="uv-logs-combine">
4237
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4238
  <div class="uv-logs-content" style="display: none;">
4239
+ Installed 37 packages in 283ms
4240
  </div>
4241
  </div>
4242
  <div class="cell-artifacts">
 
4249
  <rdf:RDF>
4250
  <ns2:Work>
4251
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4252
+ <dc:date>2025-12-19T23:02:49.888978</dc:date>
4253
  <dc:format>image/svg+xml</dc:format>
4254
  <dc:creator>
4255
  <ns2:Agent>
 
4333
  <g id="matplotlib.axis_2">
4334
  <g id="ytick_1">
4335
  <g id="grid-y--2" class="grid grid-y">
4336
+ <path d="M 47.72 409.375905 L 840.20233 409.375905 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4337
  </g>
4338
  <g id="line2d_5">
4339
  <defs>
4340
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4341
  </defs>
4342
  <g>
4343
+ <use ns4:href="#m0fca2865ba" x="47.72" y="409.375905" style="stroke: #000000; stroke-width: 0.8" />
4344
  </g>
4345
  </g>
4346
  <g id="text_5">
4347
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="413.175123" transform="rotate(-0 40.72 413.175123)">1.0</text>
4348
  </g>
4349
  </g>
4350
  <g id="ytick_2">
4351
  <g id="grid-y--3" class="grid grid-y">
4352
+ <path d="M 47.72 331.097781 L 840.20233 331.097781 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4353
  </g>
4354
  <g id="line2d_6">
4355
  <g>
4356
+ <use ns4:href="#m0fca2865ba" x="47.72" y="331.097781" style="stroke: #000000; stroke-width: 0.8" />
4357
  </g>
4358
  </g>
4359
  <g id="text_6">
4360
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="334.897" transform="rotate(-0 40.72 334.897)">1.5</text>
4361
  </g>
4362
  </g>
4363
  <g id="ytick_3">
4364
  <g id="grid-y--4" class="grid grid-y">
4365
+ <path d="M 47.72 252.819658 L 840.20233 252.819658 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4366
  </g>
4367
  <g id="line2d_7">
4368
  <g>
4369
+ <use ns4:href="#m0fca2865ba" x="47.72" y="252.819658" style="stroke: #000000; stroke-width: 0.8" />
4370
  </g>
4371
  </g>
4372
  <g id="text_7">
4373
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="256.618877" transform="rotate(-0 40.72 256.618877)">2.0</text>
4374
  </g>
4375
  </g>
4376
  <g id="ytick_4">
4377
  <g id="grid-y--5" class="grid grid-y">
4378
+ <path d="M 47.72 174.541535 L 840.20233 174.541535 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4379
  </g>
4380
  <g id="line2d_8">
4381
  <g>
4382
+ <use ns4:href="#m0fca2865ba" x="47.72" y="174.541535" style="stroke: #000000; stroke-width: 0.8" />
4383
  </g>
4384
  </g>
4385
  <g id="text_8">
4386
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="178.340753" transform="rotate(-0 40.72 178.340753)">2.5</text>
4387
  </g>
4388
  </g>
4389
  <g id="ytick_5">
4390
  <g id="grid-y--6" class="grid grid-y">
4391
+ <path d="M 47.72 96.263411 L 840.20233 96.263411 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4392
  </g>
4393
  <g id="line2d_9">
4394
  <g>
4395
+ <use ns4:href="#m0fca2865ba" x="47.72" y="96.263411" style="stroke: #000000; stroke-width: 0.8" />
4396
  </g>
4397
  </g>
4398
  <g id="text_9">
4399
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="40.72" y="100.06263" transform="rotate(-0 40.72 100.06263)">3.0</text>
4400
  </g>
4401
  </g>
4402
  <g id="label--y" class="ylabel">
 
4404
  </g>
4405
  </g>
4406
  <g id="series--torch-layer-norm" class="series">
4407
+ <path d="M 83.741924 437.689571 L 323.888085 303.198519 L 564.034245 314.331547 L 804.180406 46.442361 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4408
  <defs>
4409
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4410
  </defs>
4411
  <g clip-path="url(#p2214f54723)">
4412
  <use ns4:href="#md7efaf3aec" x="83.741924" y="437.689571" style="fill: #1f77b4; stroke: #1f77b4" />
4413
+ <use ns4:href="#md7efaf3aec" x="323.888085" y="303.198519" style="fill: #1f77b4; stroke: #1f77b4" />
4414
+ <use ns4:href="#md7efaf3aec" x="564.034245" y="314.331547" style="fill: #1f77b4; stroke: #1f77b4" />
4415
  <use ns4:href="#md7efaf3aec" x="804.180406" y="46.442361" style="fill: #1f77b4; stroke: #1f77b4" />
4416
  </g>
4417
  </g>
4418
  <g id="series--hf-kernels-layer-norm" class="series">
4419
+ <path d="M 83.741924 435.6307 L 323.888085 308.184835 L 564.034245 308.438456 L 804.180406 56.036284 " clip-path="url(#p2214f54723)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4420
  <defs>
4421
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4422
  </defs>
4423
  <g clip-path="url(#p2214f54723)">
4424
+ <use ns4:href="#m9b8c54d372" x="83.741924" y="435.6307" style="fill: #ff7f0e; stroke: #ff7f0e" />
4425
+ <use ns4:href="#m9b8c54d372" x="323.888085" y="308.184835" style="fill: #ff7f0e; stroke: #ff7f0e" />
4426
+ <use ns4:href="#m9b8c54d372" x="564.034245" y="308.438456" style="fill: #ff7f0e; stroke: #ff7f0e" />
4427
+ <use ns4:href="#m9b8c54d372" x="804.180406" y="56.036284" style="fill: #ff7f0e; stroke: #ff7f0e" />
4428
  </g>
4429
  </g>
4430
  <g id="patch_3">
openai_moe/impls/artifacts/benchmark/openai_moe.jsonl CHANGED
@@ -1,8 +1,8 @@
1
- {"ts": "2025-12-19T19:54:31Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S512_E2", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 2.591275999975551, "p50": 2.6265569999850413, "p90": 2.6390279999759514, "mean": 2.626043199961714, "iqr": 0.02082100013467425, "raw_times": [2.591275999975551, 2.6390279999759514, 2.6265569999850413, 2.618206999841277, 2.6551480000307492], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.6624880001691054, "peak_bytes": 311252992, "ok": true, "absmax": 1.0818243026733398e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.0818243026733398e-05, "mae": 1.0733322142186807e-06, "mse": 1.9560496885423495e-12, "ref": "naive_moe"}, "err": null}
2
- {"ts": "2025-12-19T19:54:31Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S512_E4", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.913345000000845, "p50": 3.932325000050696, "p90": 3.941766000025382, "mean": 3.9370316000258754, "iqr": 0.02511100001356681, "raw_times": [3.941766000025382, 3.913345000000845, 3.916655000011815, 3.981067000040639, 3.932325000050696], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.937866000114809, "peak_bytes": 632822272, "ok": true, "absmax": 7.82310962677002e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 7.82310962677002e-06, "mae": 5.576844728238939e-07, "mse": 5.436189692842319e-13, "ref": "naive_moe"}, "err": null}
3
- {"ts": "2025-12-19T19:54:32Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S1024_E2", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.80903300015234, "p50": 3.849652999861064, "p90": 3.853734000131226, "mean": 3.837069200062615, "iqr": 0.039670999967711396, "raw_times": [3.8140630001635145, 3.8588630000049307, 3.80903300015234, 3.853734000131226, 3.849652999861064], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.788761999885537, "peak_bytes": 645417472, "ok": true, "absmax": 1.5497207641601562e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.5497207641601562e-05, "mae": 1.1454358173068613e-06, "mse": 2.2412421311207575e-12, "ref": "naive_moe"}, "err": null}
4
- {"ts": "2025-12-19T19:54:34Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S1024_E4", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 5.2778859999307315, "p50": 5.308016000071802, "p90": 5.336937000038233, "mean": 5.31205640004373, "iqr": 0.038680999978168984, "raw_times": [5.339187000117818, 5.336937000038233, 5.298256000060064, 5.308016000071802, 5.2778859999307315], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 5.26179400003457, "peak_bytes": 657099264, "ok": true, "absmax": 6.556510925292969e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 6.556510925292969e-06, "mae": 4.852234951613354e-07, "mse": 4.015021550906467e-13, "ref": "naive_moe"}, "err": null}
5
- {"ts": "2025-12-19T19:54:36Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S512_E2", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 6.679864000034286, "p50": 6.717303999948854, "p90": 6.729205000056027, "mean": 6.711754200023279, "iqr": 0.028612000050998176, "raw_times": [6.679864000034286, 6.717303999948854, 6.700593000005028, 6.7318050000722, 6.729205000056027], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 6.593322000071566, "peak_bytes": 678357504, "ok": true, "absmax": 1.3589859008789062e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.3589859008789062e-05, "mae": 1.1745952406272409e-06, "mse": 2.316181968442521e-12, "ref": "naive_moe"}, "err": null}
6
- {"ts": "2025-12-19T19:54:38Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S512_E4", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 7.423924000022453, "p50": 7.518165999954363, "p90": 7.529216999955679, "mean": 7.5042842000129895, "iqr": 0.02257999994981219, "raw_times": [7.543477000126586, 7.529216999955679, 7.518165999954363, 7.506637000005867, 7.423924000022453], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 7.323180999946999, "peak_bytes": 701983232, "ok": true, "absmax": 8.58306884765625e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 8.58306884765625e-06, "mae": 5.268635732136318e-07, "mse": 4.753664909623589e-13, "ref": "naive_moe"}, "err": null}
7
- {"ts": "2025-12-19T19:54:42Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S1024_E2", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 13.163481999981741, "p50": 13.23755299995355, "p90": 13.251324000066234, "mean": 13.23588719997133, "iqr": 0.04864200013798836, "raw_times": [13.163481999981741, 13.202681999928245, 13.23755299995355, 13.32439499992688, 13.251324000066234], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 13.090128999920125, "peak_bytes": 1012207616, "ok": true, "absmax": 1.71661376953125e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.71661376953125e-05, "mae": 1.797086838450923e-06, "mse": 5.3811247992252564e-12, "ref": "naive_moe"}, "err": null}
8
- {"ts": "2025-12-19T19:54:46Z", "run": "ca1c99ac13bb4217b0fb0c501a249580", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S1024_E4", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 13.26829100003124, "p50": 13.362623000148233, "p90": 13.40691399991556, "mean": 13.346813000043767, "iqr": 0.1288519999889104, "raw_times": [13.40691399991556, 13.418175000197152, 13.26829100003124, 13.27806199992665, 13.362623000148233], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 12.873562000095262, "peak_bytes": 910968320, "ok": true, "absmax": 8.344650268554688e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 8.344650268554688e-06, "mae": 5.471991357808292e-07, "mse": 5.06310813587485e-13, "ref": "naive_moe"}, "err": null}
 
1
+ {"ts": "2025-12-19T23:01:31Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S512_E2", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 2.642566999838891, "p50": 2.6590969998778746, "p90": 2.673486999810848, "mean": 2.659981199894901, "iqr": 0.023999999939405825, "raw_times": [2.6590969998778746, 2.675268000075448, 2.649486999871442, 2.642566999838891, 2.673486999810848], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 2.7064890000474406, "peak_bytes": 311252992, "ok": true, "absmax": 1.0818243026733398e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.0818243026733398e-05, "mae": 1.0733322142186807e-06, "mse": 1.9560496885423495e-12, "ref": "naive_moe"}, "err": null}
2
+ {"ts": "2025-12-19T23:01:32Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S512_E4", "batch": 1, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.936204999945403, "p50": 3.9538260000426817, "p90": 3.9835660002154327, "mean": 3.9606518000255164, "iqr": 0.039130000232034945, "raw_times": [3.936204999945403, 3.9538260000426817, 3.9835660002154327, 3.985225999940667, 3.944435999983398], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.9596259998688765, "peak_bytes": 632822272, "ok": true, "absmax": 7.82310962677002e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 7.82310962677002e-06, "mae": 5.576844728238939e-07, "mse": 5.436189692842319e-13, "ref": "naive_moe"}, "err": null}
3
+ {"ts": "2025-12-19T23:01:33Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S1024_E2", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 3.817872999889005, "p50": 3.868872999873929, "p90": 3.9019339999413205, "mean": 3.8749997999275365, "iqr": 0.044331000026431866, "raw_times": [3.817872999889005, 3.8576029999148886, 3.9287160000185395, 3.9019339999413205, 3.868872999873929], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 3.836012999954619, "peak_bytes": 645417472, "ok": true, "absmax": 1.5497207641601562e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.5497207641601562e-05, "mae": 1.1454358173068613e-06, "mse": 2.2412421311207575e-12, "ref": "naive_moe"}, "err": null}
4
+ {"ts": "2025-12-19T23:01:34Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B1_S1024_E4", "batch": 1, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 5.3247949999786215, "p50": 5.3401449999910255, "p90": 5.39184700005535, "mean": 5.356893600037438, "iqr": 0.06286200004979037, "raw_times": [5.39184700005535, 5.3247949999786215, 5.3401449999910255, 5.328985000005559, 5.398696000156633], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 5.315443999961644, "peak_bytes": 657099264, "ok": true, "absmax": 6.556510925292969e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 6.556510925292969e-06, "mae": 4.852234951613354e-07, "mse": 4.015021550906467e-13, "ref": "naive_moe"}, "err": null}
5
+ {"ts": "2025-12-19T23:01:36Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S512_E2", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 6.797146999815595, "p50": 6.804686999885234, "p90": 6.806136000022889, "mean": 6.814822799969988, "iqr": 0.0027099999897473026, "raw_times": [6.862718000093082, 6.806136000022889, 6.797146999815595, 6.8034260000331415, 6.804686999885234], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 6.6412220000984235, "peak_bytes": 678357504, "ok": true, "absmax": 1.3589859008789062e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.3589859008789062e-05, "mae": 1.1745952406272409e-06, "mse": 2.316181968442521e-12, "ref": "naive_moe"}, "err": null}
6
+ {"ts": "2025-12-19T23:01:38Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S512_E4", "batch": 4, "seq_len": 512, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 7.520542000065689, "p50": 7.530022999844732, "p90": 7.53409300000385, "mean": 7.531816999926377, "iqr": 0.0065400001858506585, "raw_times": [7.520542000065689, 7.527552999817999, 7.546873999899617, 7.530022999844732, 7.53409300000385], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 7.356247999950938, "peak_bytes": 701983232, "ok": true, "absmax": 8.58306884765625e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 8.58306884765625e-06, "mae": 5.268635732136318e-07, "mse": 4.753664909623589e-13, "ref": "naive_moe"}, "err": null}
7
+ {"ts": "2025-12-19T23:01:42Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S1024_E2", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 2, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 13.378247999980886, "p50": 13.385679999828426, "p90": 13.397299999951429, "mean": 13.394303199947899, "iqr": 0.012501999890446314, "raw_times": [13.378247999980886, 13.384798000060982, 13.425489999917772, 13.385679999828426, 13.397299999951429], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 13.166785000066739, "peak_bytes": 1012207616, "ok": true, "absmax": 1.71661376953125e-05, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 1.71661376953125e-05, "mae": 1.797086838450923e-06, "mse": 5.3811247992252564e-12, "ref": "naive_moe"}, "err": null}
8
+ {"ts": "2025-12-19T23:01:46Z", "run": "7f201a1c2ed74237ad40459314918ba0", "impl": "gpt_oss_experts", "tags": {"family": "reference", "backend": "pytorch"}, "wl": {"name": "cuda_B4_S1024_E4", "batch": 4, "seq_len": 1024, "hidden_dim": 2880, "expert_dim": 5760, "num_experts": 4, "top_k": 2, "dtype": "float32", "device": "cuda"}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 13.328448000038406, "p50": 13.40927800015379, "p90": 13.441681000131211, "mean": 13.402939000070546, "iqr": 0.0636730001133401, "raw_times": [13.457280000011451, 13.441681000131211, 13.40927800015379, 13.378008000017871, 13.328448000038406], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 13.036729999839736, "peak_bytes": 910968320, "ok": true, "absmax": 8.344650268554688e-06, "corr": {"ok": true, "rtol": 0.01, "atol": 0.01, "absmax": 8.344650268554688e-06, "mae": 5.471991357808292e-07, "mse": 5.06310813587485e-13, "ref": "naive_moe"}, "err": null}
openai_moe/impls/binned_torch.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.28s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3904,7 +3904,7 @@ Cell: nv | 0.28s
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
- <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 19:41:48 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
@@ -3913,7 +3913,7 @@ Cell: nv | 0.28s
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
- | N/A 33C P0 126W / 350W | 0MiB / 46068MiB | 100% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
@@ -3937,7 +3937,7 @@ Cell: nv | 0.28s
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3939
  </span> |
3940
- Cell: benchmark | 733.46s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -4095,29 +4095,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S512_E2
4095
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4096
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4097
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4098
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 935.516ms 1843.92% 935.516ms 935.516ms 1
4099
- binned_torch 24.73% 231.815ms 100.00% 937.553ms 937.553ms 0.000us 0.00% 50.740ms 50.740ms 1
4100
- aten::item 1.92% 17.997ms 26.19% 245.573ms 16.003us 0.000us 0.00% 15.756ms 1.027us 15345
4101
- aten::_local_scalar_dense 6.46% 60.533ms 24.27% 227.576ms 14.831us 15.755ms 31.05% 15.756ms 1.027us 15345
4102
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 15.755ms 31.05% 15.755ms 1.027us 15345
4103
- aten::floor_divide 5.33% 49.954ms 13.00% 121.926ms 19.845us 7.813ms 15.40% 7.813ms 1.272us 6144
4104
- aten::bmm 0.02% 192.684us 0.02% 232.345us 38.724us 7.792ms 15.36% 7.792ms 1.299ms 6
4105
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.792ms 15.36% 7.792ms 1.299ms 6
4106
- aten::copy_ 3.73% 34.970ms 9.17% 86.008ms 13.971us 6.589ms 12.99% 6.590ms 1.071us 6156
4107
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.585ms 12.98% 6.585ms 1.070us 6153
4108
- aten::mul 3.28% 30.750ms 5.69% 53.382ms 17.326us 4.708ms 9.28% 4.708ms 1.528us 3081
4109
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.480ms 8.83% 4.480ms 1.458us 3072
4110
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.159ms 8.20% 4.159ms 1.354us 3072
4111
- aten::remainder 3.15% 29.490ms 4.77% 44.737ms 14.563us 3.838ms 7.56% 3.838ms 1.249us 3072
4112
- aten::add 2.76% 25.910ms 4.76% 44.643ms 14.719us 3.755ms 7.40% 3.755ms 1.238us 3033
4113
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.655ms 7.20% 3.655ms 1.190us 3072
4114
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.364ms 6.63% 3.364ms 1.110us 3030
4115
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.022ms 3.99% 2.022ms 1.316us 1536
4116
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.816ms 3.58% 1.816ms 1.182us 1536
4117
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 284.802us 0.56% 284.802us 47.467us 6
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
- Self CPU time total: 937.562ms
4120
- Self CUDA time total: 50.735ms
4121
 
4122
 
4123
 
@@ -4127,29 +4127,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S512_E4
4127
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4128
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4129
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4130
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 958.363ms 1758.28% 958.363ms 958.363ms 1
4131
- binned_torch 24.25% 232.525ms 100.00% 958.754ms 958.754ms 0.000us 0.00% 54.510ms 54.510ms 1
4132
- aten::item 1.77% 17.002ms 27.44% 263.071ms 15.534us 0.000us 0.00% 17.862ms 1.055us 16935
4133
- aten::_local_scalar_dense 6.54% 62.707ms 25.67% 246.070ms 14.530us 17.860ms 32.77% 17.862ms 1.055us 16935
4134
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 17.860ms 32.77% 17.860ms 1.055us 16935
4135
- aten::bmm 0.02% 170.065us 0.02% 212.615us 35.436us 7.895ms 14.48% 7.895ms 1.316ms 6
4136
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.895ms 14.48% 7.895ms 1.316ms 6
4137
- aten::floor_divide 4.96% 47.565ms 12.31% 117.977ms 19.202us 7.812ms 14.33% 7.813ms 1.272us 6144
4138
- aten::copy_ 3.61% 34.645ms 8.68% 83.187ms 13.513us 6.631ms 12.17% 6.631ms 1.077us 6156
4139
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.628ms 12.16% 6.628ms 1.077us 6152
4140
- aten::add 3.91% 37.531ms 7.22% 69.217ms 15.070us 5.262ms 9.65% 5.262ms 1.146us 4593
4141
- aten::mul 3.03% 29.029ms 5.30% 50.820ms 16.495us 4.703ms 8.63% 4.703ms 1.526us 3081
4142
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.476ms 8.21% 4.476ms 1.457us 3072
4143
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.156ms 7.62% 4.156ms 1.353us 3072
4144
- aten::remainder 2.84% 27.273ms 4.45% 42.673ms 13.891us 3.854ms 7.07% 3.854ms 1.255us 3072
4145
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.656ms 6.71% 3.656ms 1.190us 3072
4146
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.271ms 6.00% 3.271ms 1.080us 3030
4147
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.031ms 3.73% 2.031ms 1.323us 1536
4148
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.822ms 3.34% 1.822ms 1.187us 1536
4149
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.585ms 2.91% 1.585ms 1.016us 1560
4150
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4151
- Self CPU time total: 958.762ms
4152
- Self CUDA time total: 54.506ms
4153
 
4154
 
4155
 
@@ -4159,29 +4159,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S1024_E2
4159
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4160
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4161
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4162
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.754s 1688.21% 1.754s 1.754s 1
4163
- binned_torch 24.13% 423.200ms 100.00% 1.754s 1.754s 0.000us 0.00% 103.889ms 103.889ms 1
4164
- aten::item 1.68% 29.485ms 26.54% 465.492ms 15.256us 0.000us 0.00% 31.587ms 1.035us 30513
4165
- aten::_local_scalar_dense 6.17% 108.158ms 24.86% 436.007ms 14.289us 31.585ms 30.40% 31.587ms 1.035us 30513
4166
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 31.585ms 30.40% 31.585ms 1.035us 30513
4167
- aten::floor_divide 5.33% 93.524ms 13.33% 233.711ms 19.019us 15.605ms 15.02% 15.605ms 1.270us 12288
4168
- aten::bmm 0.01% 221.157us 0.02% 267.387us 44.564us 15.098ms 14.53% 15.098ms 2.516ms 6
4169
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.098ms 14.53% 15.098ms 2.516ms 6
4170
- aten::copy_ 3.90% 68.459ms 9.45% 165.766ms 13.477us 13.325ms 12.83% 13.325ms 1.083us 12300
4171
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.322ms 12.82% 13.322ms 1.084us 12294
4172
- aten::mul 3.29% 57.635ms 5.89% 103.357ms 16.798us 11.271ms 10.85% 11.273ms 1.832us 6153
4173
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.920ms 9.55% 9.920ms 1.615us 6144
4174
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.308ms 8.00% 8.308ms 1.352us 6144
4175
- aten::remainder 3.09% 54.193ms 4.85% 85.026ms 13.839us 7.675ms 7.39% 7.675ms 1.249us 6144
4176
- aten::add 2.79% 48.989ms 4.92% 86.297ms 14.595us 7.638ms 7.35% 7.639ms 1.292us 5913
4177
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.297ms 7.02% 7.297ms 1.188us 6144
4178
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.357ms 6.12% 6.357ms 1.076us 5910
4179
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.044ms 3.89% 4.044ms 1.317us 3072
4180
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.632ms 3.50% 3.632ms 1.182us 3072
4181
- aten::clamp 0.00% 73.899us 0.01% 123.411us 20.569us 1.193ms 1.15% 1.193ms 198.833us 6
4182
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4183
- Self CPU time total: 1.754s
4184
- Self CUDA time total: 103.882ms
4185
 
4186
 
4187
 
@@ -4191,29 +4191,29 @@ PROFILE TRACE: binned_torch | cuda_B1_S1024_E4
4191
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4192
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4193
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4194
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.874s 1695.99% 1.874s 1.874s 1
4195
- binned_torch 24.25% 455.076ms 100.00% 1.876s 1.876s 0.000us 0.00% 110.516ms 110.516ms 1
4196
- aten::item 1.77% 33.154ms 27.43% 514.675ms 15.259us 0.000us 0.00% 34.979ms 1.037us 33729
4197
- aten::_local_scalar_dense 6.27% 117.583ms 25.66% 481.520ms 14.276us 34.976ms 31.65% 34.979ms 1.037us 33729
4198
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 34.976ms 31.65% 34.976ms 1.037us 33728
4199
- aten::floor_divide 4.89% 91.819ms 12.09% 226.952ms 18.469us 15.582ms 14.10% 15.582ms 1.268us 12288
4200
- aten::bmm 0.01% 222.715us 0.01% 267.616us 44.603us 15.462ms 13.99% 15.462ms 2.577ms 6
4201
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.462ms 13.99% 15.462ms 2.577ms 6
4202
- aten::copy_ 3.58% 67.106ms 8.62% 161.781ms 13.153us 13.339ms 12.07% 13.341ms 1.085us 12300
4203
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.337ms 12.07% 13.337ms 1.085us 12294
4204
- aten::mul 3.09% 57.893ms 5.35% 100.363ms 16.311us 10.926ms 9.89% 10.927ms 1.776us 6153
4205
- aten::add 4.06% 76.225ms 6.94% 130.290ms 14.319us 10.845ms 9.81% 10.845ms 1.192us 9099
4206
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.572ms 8.66% 9.572ms 1.558us 6144
4207
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.302ms 7.51% 8.302ms 1.351us 6144
4208
- aten::remainder 2.99% 56.031ms 4.55% 85.473ms 13.912us 7.682ms 6.95% 7.682ms 1.250us 6144
4209
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.280ms 6.59% 7.280ms 1.185us 6144
4210
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.358ms 5.75% 6.358ms 1.076us 5910
4211
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.050ms 3.67% 4.050ms 1.318us 3072
4212
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.631ms 3.29% 3.631ms 1.182us 3072
4213
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.228ms 2.92% 3.228ms 1.013us 3186
4214
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4215
- Self CPU time total: 1.876s
4216
- Self CUDA time total: 110.507ms
4217
 
4218
 
4219
 
@@ -4223,29 +4223,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S512_E2
4223
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4224
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4225
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4226
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.610s 1697.16% 3.610s 3.610s 1
4227
- binned_torch 23.68% 855.222ms 100.00% 3.611s 3.611s 0.000us 0.00% 212.735ms 212.735ms 1
4228
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 63.569ms 29.88% 63.569ms 1.032us 61587
4229
- aten::item 1.81% 65.197ms 27.34% 987.119ms 16.028us 0.000us 0.00% 63.568ms 1.032us 61587
4230
- aten::_local_scalar_dense 6.48% 233.826ms 25.53% 921.922ms 14.969us 63.567ms 29.88% 63.568ms 1.032us 61587
4231
- aten::floor_divide 5.24% 189.036ms 13.02% 470.235ms 19.134us 31.579ms 14.85% 31.582ms 1.285us 24576
4232
- aten::bmm 0.01% 232.455us 0.01% 281.845us 46.974us 29.001ms 13.63% 29.001ms 4.833ms 6
4233
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 29.001ms 13.63% 29.001ms 4.833ms 6
4234
- aten::copy_ 3.67% 132.477ms 9.25% 334.079ms 13.587us 26.719ms 12.56% 26.722ms 1.087us 24588
4235
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.715ms 12.56% 26.715ms 1.087us 24585
4236
- aten::mul 3.15% 113.903ms 5.68% 205.201ms 16.687us 25.580ms 12.03% 25.582ms 2.080us 12297
4237
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.132ms 10.40% 22.132ms 1.801us 12288
4238
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.992ms 7.99% 16.992ms 1.383us 12288
4239
- aten::add 2.81% 101.355ms 4.98% 179.658ms 14.476us 16.634ms 7.82% 16.635ms 1.340us 12411
4240
- aten::remainder 3.15% 113.609ms 4.99% 180.020ms 14.650us 15.413ms 7.25% 15.415ms 1.255us 12288
4241
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.588ms 6.86% 14.588ms 1.187us 12288
4242
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.512ms 6.35% 13.512ms 1.089us 12408
4243
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.121ms 3.82% 8.121ms 1.322us 6144
4244
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.292ms 3.43% 7.292ms 1.187us 6144
4245
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.612ms 1.23% 2.612ms 435.298us 6
4246
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4247
- Self CPU time total: 3.611s
4248
- Self CUDA time total: 212.720ms
4249
 
4250
 
4251
 
@@ -4255,29 +4255,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S512_E4
4255
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4256
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4257
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4258
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.762s 1666.70% 3.762s 3.762s 1
4259
- binned_torch 23.91% 899.748ms 100.00% 3.764s 3.764s 0.000us 0.00% 225.734ms 225.734ms 1
4260
- aten::item 1.82% 68.620ms 27.46% 1.034s 15.235us 0.000us 0.00% 69.795ms 1.029us 67845
4261
- aten::_local_scalar_dense 6.31% 237.441ms 25.64% 964.994ms 14.224us 69.792ms 30.92% 69.795ms 1.029us 67845
4262
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 69.793ms 30.92% 69.793ms 1.029us 67840
4263
- aten::floor_divide 4.95% 186.290ms 12.17% 458.105ms 18.640us 31.553ms 13.98% 31.560ms 1.284us 24576
4264
- aten::bmm 0.01% 226.315us 0.01% 272.505us 45.418us 29.269ms 12.97% 29.269ms 4.878ms 6
4265
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 29.269ms 12.97% 29.269ms 4.878ms 6
4266
- aten::copy_ 3.56% 134.013ms 8.54% 321.380ms 13.071us 26.742ms 11.85% 26.743ms 1.088us 24588
4267
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.740ms 11.85% 26.740ms 1.088us 24581
4268
- aten::mul 3.06% 115.077ms 5.31% 199.757ms 16.244us 25.618ms 11.35% 25.618ms 2.083us 12297
4269
- aten::add 4.14% 155.825ms 7.08% 266.365ms 14.291us 23.275ms 10.31% 23.276ms 1.249us 18639
4270
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.160ms 9.82% 22.160ms 1.803us 12288
4271
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 17.005ms 7.53% 17.005ms 1.384us 12287
4272
- aten::remainder 2.93% 110.282ms 4.49% 168.952ms 13.749us 15.362ms 6.81% 15.364ms 1.250us 12288
4273
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.548ms 6.45% 14.548ms 1.184us 12287
4274
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.690ms 6.07% 13.690ms 1.103us 12407
4275
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.098ms 3.59% 8.098ms 1.318us 6144
4276
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.264ms 3.22% 7.264ms 1.182us 6144
4277
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.476ms 2.87% 6.476ms 1.040us 6228
4278
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4279
- Self CPU time total: 3.764s
4280
- Self CUDA time total: 225.722ms
4281
 
4282
 
4283
 
@@ -4287,29 +4287,29 @@ PROFILE TRACE: binned_torch | cuda_B4_S1024_E2
4287
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4288
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4289
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4290
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 7.172s 1685.34% 7.172s 7.172s 1
4291
- binned_torch 23.83% 1.712s 100.00% 7.184s 7.184s 0.000us 0.00% 425.602ms 425.602ms 1
4292
- aten::item 1.77% 127.233ms 27.17% 1.952s 15.898us 0.000us 0.00% 127.069ms 1.035us 122763
4293
- aten::_local_scalar_dense 6.22% 446.668ms 25.40% 1.825s 14.862us 127.060ms 29.86% 127.069ms 1.035us 122763
4294
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 127.060ms 29.86% 127.060ms 1.035us 122762
4295
- aten::floor_divide 5.22% 375.373ms 13.07% 938.750ms 19.099us 63.372ms 14.89% 63.374ms 1.289us 49152
4296
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 57.057ms 13.41% 57.057ms 9.509ms 6
4297
- aten::bmm 0.00% 232.954us 0.00% 280.556us 46.759us 57.057ms 13.41% 57.057ms 9.509ms 6
4298
- aten::copy_ 3.67% 263.382ms 9.14% 656.814ms 13.361us 53.605ms 12.60% 53.606ms 1.090us 49158
4299
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.603ms 12.60% 53.603ms 1.091us 49154
4300
- aten::mul 3.19% 229.239ms 5.71% 410.065ms 16.679us 51.561ms 12.12% 51.568ms 2.098us 24585
4301
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.597ms 10.48% 44.597ms 1.815us 24576
4302
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 34.170ms 8.03% 34.170ms 1.390us 24576
4303
- aten::add 2.78% 199.917ms 4.97% 356.982ms 14.609us 33.583ms 7.89% 33.584ms 1.374us 24435
4304
- aten::remainder 3.17% 227.943ms 4.97% 356.780ms 14.517us 30.902ms 7.26% 30.903ms 1.257us 24576
4305
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.202ms 6.86% 29.202ms 1.188us 24576
4306
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 26.924ms 6.33% 26.924ms 1.102us 24431
4307
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.278ms 3.82% 16.278ms 1.325us 12288
4308
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.628ms 3.44% 14.628ms 1.190us 12288
4309
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 5.242ms 1.23% 5.242ms 873.601us 6
4310
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4311
- Self CPU time total: 7.184s
4312
- Self CUDA time total: 425.579ms
4313
 
4314
 
4315
 
@@ -4319,45 +4319,45 @@ PROFILE TRACE: binned_torch | cuda_B4_S1024_E4
4319
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4320
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4321
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4322
- binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 7.590s 1687.04% 7.590s 7.590s 1
4323
- binned_torch 23.93% 1.817s 100.00% 7.592s 7.592s 0.000us 0.00% 449.935ms 449.935ms 1
4324
- aten::item 1.74% 131.929ms 27.26% 2.070s 15.365us 0.000us 0.00% 139.467ms 1.035us 134715
4325
- aten::_local_scalar_dense 6.36% 483.083ms 25.53% 1.938s 14.386us 139.456ms 31.00% 139.467ms 1.035us 134715
4326
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 139.456ms 31.00% 139.456ms 1.035us 134706
4327
- aten::floor_divide 4.94% 375.293ms 12.19% 925.665ms 18.833us 63.455ms 14.10% 63.460ms 1.291us 49152
4328
- aten::bmm 0.00% 234.075us 0.00% 282.947us 47.158us 56.663ms 12.59% 56.663ms 9.444ms 6
4329
- ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 56.663ms 12.59% 56.663ms 9.444ms 6
4330
- aten::copy_ 3.75% 285.044ms 8.75% 664.131ms 13.510us 53.858ms 11.97% 53.860ms 1.096us 49158
4331
- Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.855ms 11.97% 53.855ms 1.096us 49149
4332
- aten::mul 3.08% 233.920ms 5.34% 405.684ms 16.501us 51.582ms 11.47% 51.587ms 2.098us 24585
4333
- aten::add 3.87% 294.168ms 6.87% 521.854ms 14.354us 45.530ms 10.12% 45.534ms 1.252us 36357
4334
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.640ms 9.92% 44.640ms 1.816us 24576
4335
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 34.166ms 7.59% 34.166ms 1.390us 24573
4336
- aten::remainder 2.91% 220.707ms 4.59% 348.339ms 14.174us 30.841ms 6.86% 30.843ms 1.255us 24576
4337
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 29.291ms 6.51% 29.291ms 1.192us 24573
4338
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 26.632ms 5.92% 26.632ms 1.090us 24431
4339
- void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.258ms 3.61% 16.258ms 1.323us 12288
4340
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.582ms 3.24% 14.582ms 1.187us 12288
4341
- void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 12.272ms 2.73% 12.272ms 1.029us 11922
4342
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4343
- Self CPU time total: 7.592s
4344
- Self CUDA time total: 449.893ms
4345
 
4346
 
4347
  impl wl p50(ms) ok
4348
- binned_torch cuda_B1_S1024_E2 377.89 True
4349
- binned_torch cuda_B1_S1024_E4 408.91 True
4350
- binned_torch cuda_B1_S512_E2 158.27 True
4351
- binned_torch cuda_B1_S512_E4 209.01 True
4352
- binned_torch cuda_B4_S1024_E2 1516.51 True
4353
- binned_torch cuda_B4_S1024_E4 1643.14 True
4354
- binned_torch cuda_B4_S512_E2 769.64 True
4355
- binned_torch cuda_B4_S512_E4 816.95 True
4356
  </pre></div>
4357
  <div class="uv-install-logs" id="uv-logs-benchmark">
4358
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4359
  <div class="uv-logs-content" style="display: none;">
4360
- Installed 37 packages in 284ms
4361
  </div>
4362
  </div>
4363
  <div class="cell-artifacts">
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.25s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3904
  </div>
3905
  </div>
3906
  <div id="output-nv" class="cell-output">
3907
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 23:00:37 2025
3908
  +-----------------------------------------------------------------------------------------+
3909
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3910
  +-----------------------------------------+------------------------+----------------------+
 
3913
  | | | MIG M. |
3914
  |=========================================+========================+======================|
3915
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3916
+ | N/A 40C P0 84W / 350W | 0MiB / 46068MiB | 60% Default |
3917
  | | | N/A |
3918
  +-----------------------------------------+------------------------+----------------------+
3919
 
 
3937
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3938
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3939
  </span> |
3940
+ Cell: benchmark | 723.84s
3941
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3942
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3943
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
4095
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4096
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4097
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4098
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 916.334ms 1818.27% 916.334ms 916.334ms 1
4099
+ binned_torch 24.63% 226.221ms 100.00% 918.346ms 918.346ms 0.000us 0.00% 50.398ms 50.398ms 1
4100
+ aten::item 1.84% 16.915ms 25.73% 236.247ms 15.396us 0.000us 0.00% 15.727ms 1.025us 15345
4101
+ aten::_local_scalar_dense 5.92% 54.373ms 23.88% 219.332ms 14.293us 15.726ms 31.20% 15.727ms 1.025us 15345
4102
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 15.726ms 31.20% 15.726ms 1.025us 15345
4103
+ aten::bmm 0.02% 194.226us 0.03% 236.195us 39.366us 8.013ms 15.90% 8.013ms 1.336ms 6
4104
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 8.013ms 15.90% 8.013ms 1.336ms 6
4105
+ aten::floor_divide 5.35% 49.157ms 13.15% 120.743ms 19.652us 7.547ms 14.98% 7.547ms 1.228us 6144
4106
+ aten::copy_ 3.75% 34.457ms 9.21% 84.535ms 13.732us 6.589ms 13.08% 6.592ms 1.071us 6156
4107
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.585ms 13.07% 6.585ms 1.070us 6153
4108
+ aten::mul 3.14% 28.847ms 5.63% 51.742ms 16.794us 4.707ms 9.34% 4.707ms 1.528us 3081
4109
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.479ms 8.89% 4.479ms 1.458us 3072
4110
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.026ms 7.99% 4.026ms 1.311us 3072
4111
+ aten::remainder 3.09% 28.363ms 4.76% 43.750ms 14.241us 3.702ms 7.35% 3.702ms 1.205us 3072
4112
+ aten::add 2.79% 25.584ms 4.81% 44.150ms 14.557us 3.631ms 7.20% 3.631ms 1.197us 3033
4113
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.522ms 6.99% 3.522ms 1.147us 3072
4114
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.235ms 6.42% 3.235ms 1.068us 3030
4115
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.954ms 3.88% 1.954ms 1.272us 1536
4116
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.749ms 3.47% 1.749ms 1.138us 1536
4117
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 287.138us 0.57% 287.138us 47.856us 6
4118
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4119
+ Self CPU time total: 918.353ms
4120
+ Self CUDA time total: 50.396ms
4121
 
4122
 
4123
 
 
4127
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4128
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4129
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4130
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 930.604ms 1724.65% 930.604ms 930.604ms 1
4131
+ binned_torch 24.29% 226.115ms 100.00% 930.865ms 930.865ms 0.000us 0.00% 53.966ms 53.966ms 1
4132
+ aten::item 1.81% 16.815ms 27.55% 256.425ms 15.142us 0.000us 0.00% 17.838ms 1.053us 16935
4133
+ aten::_local_scalar_dense 6.14% 57.141ms 25.74% 239.611ms 14.149us 17.835ms 33.05% 17.838ms 1.053us 16935
4134
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 17.835ms 33.05% 17.835ms 1.053us 16935
4135
+ aten::bmm 0.02% 175.424us 0.02% 217.325us 36.221us 7.967ms 14.77% 7.967ms 1.328ms 6
4136
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 7.967ms 14.77% 7.967ms 1.328ms 6
4137
+ aten::floor_divide 5.05% 47.005ms 12.57% 117.000ms 19.043us 7.550ms 13.99% 7.551ms 1.229us 6144
4138
+ aten::copy_ 3.51% 32.640ms 8.36% 77.831ms 12.643us 6.635ms 12.30% 6.635ms 1.078us 6156
4139
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.632ms 12.29% 6.632ms 1.078us 6152
4140
+ aten::add 3.89% 36.256ms 6.95% 64.697ms 14.086us 5.059ms 9.38% 5.059ms 1.102us 4593
4141
+ aten::mul 2.92% 27.144ms 5.32% 49.502ms 16.067us 4.707ms 8.72% 4.707ms 1.528us 3081
4142
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 4.479ms 8.30% 4.479ms 1.458us 3072
4143
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 4.026ms 7.46% 4.026ms 1.310us 3072
4144
+ aten::remainder 2.81% 26.197ms 4.49% 41.800ms 13.607us 3.721ms 6.90% 3.721ms 1.211us 3072
4145
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.524ms 6.53% 3.524ms 1.147us 3072
4146
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.140ms 5.82% 3.140ms 1.036us 3030
4147
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.965ms 3.64% 1.965ms 1.279us 1536
4148
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.756ms 3.25% 1.756ms 1.143us 1536
4149
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 1.517ms 2.81% 1.517ms 0.972us 1560
4150
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4151
+ Self CPU time total: 930.874ms
4152
+ Self CUDA time total: 53.959ms
4153
 
4154
 
4155
 
 
4159
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4160
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4161
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4162
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.706s 1653.15% 1.706s 1.706s 1
4163
+ binned_torch 24.03% 409.734ms 100.00% 1.705s 1.705s 0.000us 0.00% 103.183ms 103.183ms 1
4164
+ aten::item 1.59% 27.070ms 26.54% 452.490ms 14.829us 0.000us 0.00% 31.572ms 1.035us 30513
4165
+ aten::_local_scalar_dense 5.90% 100.602ms 24.95% 425.421ms 13.942us 31.568ms 30.60% 31.572ms 1.035us 30513
4166
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 31.568ms 30.60% 31.568ms 1.035us 30513
4167
+ aten::bmm 0.01% 213.024us 0.02% 261.877us 43.646us 15.473ms 15.00% 15.473ms 2.579ms 6
4168
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.473ms 15.00% 15.473ms 2.579ms 6
4169
+ aten::floor_divide 5.42% 92.355ms 13.36% 227.861ms 18.543us 15.078ms 14.61% 15.078ms 1.227us 12288
4170
+ aten::copy_ 3.96% 67.445ms 9.41% 160.444ms 13.044us 13.330ms 12.92% 13.330ms 1.084us 12300
4171
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.326ms 12.92% 13.326ms 1.084us 12294
4172
+ aten::mul 3.18% 54.204ms 5.76% 98.288ms 15.974us 11.263ms 10.92% 11.265ms 1.831us 6153
4173
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.919ms 9.61% 9.919ms 1.614us 6144
4174
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.044ms 7.80% 8.044ms 1.309us 6144
4175
+ aten::remainder 3.09% 52.622ms 4.84% 82.495ms 13.427us 7.409ms 7.18% 7.409ms 1.206us 6144
4176
+ aten::add 2.82% 48.063ms 4.95% 84.371ms 14.269us 7.380ms 7.15% 7.380ms 1.248us 5913
4177
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.034ms 6.82% 7.034ms 1.145us 6144
4178
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.098ms 5.91% 6.098ms 1.032us 5910
4179
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.912ms 3.79% 3.912ms 1.273us 3072
4180
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.498ms 3.39% 3.498ms 1.139us 3072
4181
+ aten::clamp 0.00% 70.381us 0.01% 115.343us 19.224us 1.182ms 1.15% 1.182ms 197.026us 6
4182
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4183
+ Self CPU time total: 1.705s
4184
+ Self CUDA time total: 103.179ms
4185
 
4186
 
4187
 
 
4191
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4192
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4193
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4194
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 1.835s 1676.06% 1.835s 1.835s 1
4195
+ binned_torch 24.11% 442.690ms 100.00% 1.836s 1.836s 0.000us 0.00% 109.503ms 109.503ms 1
4196
+ aten::item 1.62% 29.702ms 27.50% 504.982ms 14.972us 0.000us 0.00% 35.015ms 1.038us 33729
4197
+ aten::_local_scalar_dense 6.21% 114.112ms 25.88% 475.279ms 14.091us 35.012ms 31.97% 35.015ms 1.038us 33729
4198
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 35.012ms 31.97% 35.012ms 1.038us 33728
4199
+ aten::bmm 0.01% 232.655us 0.02% 282.685us 47.114us 15.567ms 14.22% 15.567ms 2.595ms 6
4200
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 15.567ms 14.22% 15.567ms 2.595ms 6
4201
+ aten::floor_divide 5.11% 93.914ms 12.52% 229.926ms 18.711us 15.067ms 13.76% 15.067ms 1.226us 12288
4202
+ aten::copy_ 3.50% 64.191ms 8.58% 157.627ms 12.815us 13.353ms 12.19% 13.355ms 1.086us 12300
4203
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 13.350ms 12.19% 13.350ms 1.086us 12294
4204
+ aten::mul 2.97% 54.553ms 5.34% 97.962ms 15.921us 10.925ms 9.98% 10.925ms 1.776us 6153
4205
+ aten::add 3.96% 72.764ms 6.93% 127.157ms 13.975us 10.457ms 9.55% 10.457ms 1.149us 9099
4206
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 9.572ms 8.74% 9.572ms 1.558us 6144
4207
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 8.046ms 7.35% 8.046ms 1.310us 6144
4208
+ aten::remainder 2.95% 54.099ms 4.66% 85.633ms 13.938us 7.422ms 6.78% 7.422ms 1.208us 6144
4209
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.021ms 6.41% 7.021ms 1.143us 6144
4210
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 6.106ms 5.58% 6.106ms 1.033us 5910
4211
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 3.920ms 3.58% 3.920ms 1.276us 3072
4212
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.502ms 3.20% 3.502ms 1.140us 3072
4213
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 3.094ms 2.83% 3.094ms 0.971us 3186
4214
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4215
+ Self CPU time total: 1.836s
4216
+ Self CUDA time total: 109.497ms
4217
 
4218
 
4219
 
 
4223
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4224
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4225
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4226
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.483s 1652.23% 3.483s 3.483s 1
4227
+ binned_torch 24.18% 842.026ms 100.00% 3.482s 3.482s 0.000us 0.00% 210.838ms 210.838ms 1
4228
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 63.561ms 30.15% 63.561ms 1.032us 61586
4229
+ aten::item 1.74% 60.466ms 26.96% 938.865ms 15.245us 0.000us 0.00% 63.559ms 1.032us 61587
4230
+ aten::_local_scalar_dense 6.04% 210.488ms 25.22% 878.295ms 14.261us 63.559ms 30.15% 63.559ms 1.032us 61587
4231
+ aten::floor_divide 5.38% 187.378ms 13.29% 462.870ms 18.834us 30.531ms 14.48% 30.538ms 1.243us 24576
4232
+ aten::bmm 0.01% 232.923us 0.01% 283.154us 47.192us 29.267ms 13.88% 29.267ms 4.878ms 6
4233
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 29.267ms 13.88% 29.267ms 4.878ms 6
4234
+ aten::copy_ 3.71% 129.087ms 8.89% 309.556ms 12.590us 26.727ms 12.68% 26.728ms 1.087us 24588
4235
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.725ms 12.68% 26.725ms 1.087us 24582
4236
+ aten::mul 3.12% 108.737ms 5.69% 198.327ms 16.128us 25.576ms 12.13% 25.578ms 2.080us 12297
4237
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.134ms 10.50% 22.134ms 1.801us 12288
4238
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.473ms 7.81% 16.473ms 1.341us 12288
4239
+ aten::add 2.81% 97.833ms 4.96% 172.866ms 13.928us 16.092ms 7.63% 16.093ms 1.297us 12411
4240
+ aten::remainder 3.07% 106.957ms 4.82% 167.982ms 13.670us 14.887ms 7.06% 14.889ms 1.212us 12288
4241
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.058ms 6.67% 14.058ms 1.144us 12288
4242
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 12.970ms 6.15% 12.970ms 1.045us 12408
4243
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 7.857ms 3.73% 7.857ms 1.279us 6144
4244
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 7.030ms 3.33% 7.030ms 1.144us 6144
4245
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 2.605ms 1.24% 2.605ms 434.242us 6
4246
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4247
+ Self CPU time total: 3.483s
4248
+ Self CUDA time total: 210.821ms
4249
 
4250
 
4251
 
 
4255
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4256
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4257
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4258
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 3.725s 1668.35% 3.725s 3.725s 1
4259
+ binned_torch 24.05% 896.242ms 100.00% 3.727s 3.727s 0.000us 0.00% 223.307ms 223.307ms 1
4260
+ aten::item 1.73% 64.547ms 27.53% 1.026s 15.123us 0.000us 0.00% 69.633ms 1.026us 67845
4261
+ aten::_local_scalar_dense 6.19% 230.534ms 25.80% 961.495ms 14.172us 69.631ms 31.18% 69.633ms 1.026us 67845
4262
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 69.632ms 31.18% 69.632ms 1.026us 67841
4263
+ aten::floor_divide 5.09% 189.838ms 12.50% 465.764ms 18.952us 30.442ms 13.63% 30.448ms 1.239us 24576
4264
+ aten::bmm 0.01% 247.707us 0.01% 294.697us 49.116us 29.554ms 13.24% 29.554ms 4.926ms 6
4265
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 29.554ms 13.24% 29.554ms 4.926ms 6
4266
+ aten::copy_ 3.50% 130.326ms 8.36% 311.636ms 12.674us 26.718ms 11.97% 26.719ms 1.087us 24588
4267
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 26.715ms 11.96% 26.715ms 1.087us 24581
4268
+ aten::mul 2.92% 108.800ms 5.34% 198.878ms 16.173us 25.547ms 11.44% 25.547ms 2.077us 12297
4269
+ aten::add 3.96% 147.436ms 7.04% 262.447ms 14.081us 22.490ms 10.07% 22.492ms 1.207us 18639
4270
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 22.115ms 9.90% 22.115ms 1.800us 12288
4271
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 16.451ms 7.37% 16.451ms 1.339us 12287
4272
+ aten::remainder 2.81% 104.739ms 4.44% 165.425ms 13.462us 14.805ms 6.63% 14.806ms 1.205us 12288
4273
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 13.992ms 6.27% 13.992ms 1.139us 12287
4274
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 13.166ms 5.90% 13.166ms 1.061us 12407
4275
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 7.819ms 3.50% 7.819ms 1.273us 6144
4276
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.986ms 3.13% 6.986ms 1.137us 6144
4277
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 6.214ms 2.78% 6.214ms 0.998us 6228
4278
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4279
+ Self CPU time total: 3.727s
4280
+ Self CUDA time total: 223.293ms
4281
 
4282
 
4283
 
 
4287
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4288
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4289
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4290
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 6.919s 1639.48% 6.919s 6.919s 1
4291
+ binned_torch 24.46% 1.695s 100.00% 6.929s 6.929s 0.000us 0.00% 422.036ms 422.036ms 1
4292
+ aten::item 1.67% 115.500ms 26.73% 1.852s 15.089us 0.000us 0.00% 127.102ms 1.035us 122763
4293
+ aten::_local_scalar_dense 5.94% 411.594ms 25.07% 1.737s 14.148us 127.094ms 30.12% 127.102ms 1.035us 122763
4294
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 127.096ms 30.12% 127.096ms 1.035us 122762
4295
+ aten::floor_divide 5.38% 373.026ms 13.30% 921.425ms 18.746us 61.339ms 14.53% 61.343ms 1.248us 49152
4296
+ aten::bmm 0.00% 231.234us 0.00% 280.225us 46.704us 57.287ms 13.57% 57.287ms 9.548ms 6
4297
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 57.287ms 13.57% 57.287ms 9.548ms 6
4298
+ aten::copy_ 3.72% 257.654ms 8.91% 617.063ms 12.553us 53.696ms 12.72% 53.697ms 1.092us 49158
4299
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.694ms 12.72% 53.694ms 1.092us 49154
4300
+ aten::mul 3.13% 217.096ms 5.68% 393.622ms 16.011us 51.639ms 12.24% 51.644ms 2.101us 24585
4301
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.676ms 10.59% 44.676ms 1.818us 24576
4302
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 33.163ms 7.86% 33.163ms 1.349us 24576
4303
+ aten::add 2.81% 194.866ms 4.91% 340.544ms 13.937us 32.585ms 7.72% 32.588ms 1.334us 24435
4304
+ aten::remainder 3.09% 213.993ms 4.85% 335.801ms 13.664us 29.914ms 7.09% 29.918ms 1.217us 24576
4305
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 28.177ms 6.68% 28.177ms 1.147us 24576
4306
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.921ms 6.14% 25.921ms 1.061us 24431
4307
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.786ms 3.74% 15.786ms 1.285us 12288
4308
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.129ms 3.35% 14.129ms 1.150us 12288
4309
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 5.239ms 1.24% 5.239ms 873.180us 6
4310
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4311
+ Self CPU time total: 6.929s
4312
+ Self CUDA time total: 422.014ms
4313
 
4314
 
4315
 
 
4319
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4320
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4321
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4322
+ binned_torch 0.00% 0.000us 0.00% 0.000us 0.000us 7.526s 1690.98% 7.526s 7.526s 1
4323
+ binned_torch 24.06% 1.811s 100.00% 7.528s 7.528s 0.000us 0.00% 445.109ms 445.109ms 1
4324
+ aten::item 1.62% 121.583ms 26.84% 2.020s 14.998us 0.000us 0.00% 138.816ms 1.030us 134715
4325
+ aten::_local_scalar_dense 6.12% 460.388ms 25.22% 1.899s 14.095us 138.805ms 31.19% 138.816ms 1.030us 134715
4326
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 138.805ms 31.19% 138.805ms 1.030us 134707
4327
+ aten::floor_divide 5.25% 395.063ms 12.72% 957.555ms 19.482us 61.331ms 13.78% 61.336ms 1.248us 49152
4328
+ aten::bmm 0.00% 238.536us 0.00% 289.618us 48.270us 57.304ms 12.88% 57.304ms 9.551ms 6
4329
+ ampere_sgemm_128x128_nn 0.00% 0.000us 0.00% 0.000us 0.000us 57.304ms 12.88% 57.304ms 9.551ms 6
4330
+ aten::copy_ 3.62% 272.274ms 8.61% 648.516ms 13.192us 53.873ms 12.10% 53.876ms 1.096us 49158
4331
+ Memcpy DtoD (Device -&gt; Device) 0.00% 0.000us 0.00% 0.000us 0.000us 53.870ms 12.10% 53.870ms 1.096us 49149
4332
+ aten::mul 3.08% 231.551ms 5.44% 409.269ms 16.647us 51.546ms 11.58% 51.551ms 2.097us 24585
4333
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 44.593ms 10.02% 44.593ms 1.814us 24576
4334
+ aten::add 4.08% 306.812ms 7.05% 530.578ms 14.594us 43.966ms 9.88% 43.969ms 1.209us 36357
4335
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 33.107ms 7.44% 33.107ms 1.347us 24573
4336
+ aten::remainder 2.97% 223.921ms 4.70% 353.632ms 14.389us 29.770ms 6.69% 29.775ms 1.211us 24577
4337
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 28.225ms 6.34% 28.225ms 1.149us 24573
4338
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 25.583ms 5.75% 25.583ms 1.047us 24431
4339
+ void at::native::vectorized_elementwise_kernel&lt;2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 15.722ms 3.53% 15.722ms 1.279us 12288
4340
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 14.047ms 3.16% 14.047ms 1.143us 12288
4341
+ void at::native::unrolled_elementwise_kernel&lt;at::nat... 0.00% 0.000us 0.00% 0.000us 0.000us 11.757ms 2.64% 11.757ms 0.986us 11922
4342
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4343
+ Self CPU time total: 7.528s
4344
+ Self CUDA time total: 445.070ms
4345
 
4346
 
4347
  impl wl p50(ms) ok
4348
+ binned_torch cuda_B1_S1024_E2 367.98 True
4349
+ binned_torch cuda_B1_S1024_E4 396.30 True
4350
+ binned_torch cuda_B1_S512_E2 154.35 True
4351
+ binned_torch cuda_B1_S512_E4 195.55 True
4352
+ binned_torch cuda_B4_S1024_E2 1510.09 True
4353
+ binned_torch cuda_B4_S1024_E4 1618.05 True
4354
+ binned_torch cuda_B4_S512_E2 733.47 True
4355
+ binned_torch cuda_B4_S512_E4 787.61 True
4356
  </pre></div>
4357
  <div class="uv-install-logs" id="uv-logs-benchmark">
4358
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4359
  <div class="uv-logs-content" style="display: none;">
4360
+ Installed 37 packages in 322ms
4361
  </div>
4362
  </div>
4363
  <div class="cell-artifacts">
openai_moe/impls/gpt_oss_moe.html CHANGED
@@ -3888,7 +3888,7 @@ body[data-tool="eraser"] .main-content {
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
- Cell: nv | 0.28s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
@@ -3905,7 +3905,7 @@ Cell: nv | 0.28s
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
- <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 19:41:48 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
@@ -3914,7 +3914,7 @@ Cell: nv | 0.28s
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
- | N/A 33C P0 126W / 350W | 0MiB / 46068MiB | 100% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
@@ -3938,7 +3938,7 @@ Cell: nv | 0.28s
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3940
  </span> |
3941
- Cell: benchmark | 21.43s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
@@ -4042,29 +4042,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S512_E2
4042
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4043
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4044
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4045
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 10.220ms 197.88% 10.220ms 10.220ms 1
4046
- gpt_oss_experts 16.01% 2.006ms 99.94% 12.523ms 12.523ms 0.000us 0.00% 5.168ms 5.168ms 1
4047
- aten::matmul 0.20% 24.744us 3.78% 473.582us 39.465us 0.000us 0.00% 4.543ms 378.565us 12
4048
- aten::mm 2.31% 289.874us 3.58% 448.838us 37.403us 4.543ms 87.96% 4.543ms 378.565us 12
4049
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.093ms 59.88% 3.093ms 343.626us 9
4050
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.444ms 27.95% 1.444ms 481.227us 3
4051
- aten::mul 1.34% 167.604us 2.25% 281.908us 11.746us 108.865us 2.11% 108.865us 4.536us 24
4052
- aten::add 1.61% 201.238us 3.79% 474.483us 26.360us 102.656us 1.99% 102.656us 5.703us 18
4053
- aten::index 1.69% 212.259us 2.75% 345.042us 28.753us 88.512us 1.71% 88.512us 7.376us 12
4054
- aten::index_add_ 0.46% 58.122us 0.75% 94.202us 15.700us 80.160us 1.55% 80.160us 13.360us 6
4055
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 80.160us 1.55% 80.160us 13.360us 6
4056
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 80.000us 1.55% 80.000us 6.667us 12
4057
- aten::nonzero 2.08% 261.099us 6.37% 797.848us 88.650us 65.246us 1.26% 76.095us 8.455us 9
4058
- aten::clamp 0.95% 119.641us 1.55% 194.514us 16.209us 63.010us 1.22% 63.010us 5.251us 12
4059
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 63.010us 1.22% 63.010us 5.251us 12
4060
- aten::where 0.06% 7.130us 5.02% 629.533us 104.922us 0.000us 0.00% 61.472us 10.245us 6
4061
- aten::nonzero_numpy 0.09% 11.550us 4.97% 622.403us 103.734us 0.000us 0.00% 61.472us 10.245us 6
4062
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 60.800us 1.18% 60.800us 10.133us 6
4063
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 56.608us 1.10% 56.608us 4.717us 12
4064
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 50.776us 0.98% 50.776us 1.128us 45
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
- Self CPU time total: 12.530ms
4067
- Self CUDA time total: 5.165ms
4068
 
4069
 
4070
 
@@ -4074,29 +4074,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S512_E4
4074
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4075
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 14.281ms 232.73% 14.281ms 14.281ms 1
4078
- gpt_oss_experts 16.85% 2.763ms 99.97% 16.396ms 16.396ms 0.000us 0.00% 6.139ms 6.139ms 1
4079
- aten::matmul 0.27% 44.470us 4.93% 808.156us 33.673us 0.000us 0.00% 5.322ms 221.756us 24
4080
- aten::mm 2.81% 461.070us 4.66% 763.686us 31.820us 5.322ms 86.73% 5.322ms 221.756us 24
4081
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.267ms 85.83% 5.267ms 219.440us 24
4082
- aten::nonzero 2.44% 399.465us 7.84% 1.285ms 85.683us 115.131us 1.88% 137.882us 9.192us 15
4083
- aten::mul 1.86% 305.625us 3.19% 523.892us 10.914us 131.841us 2.15% 131.841us 2.747us 48
4084
- aten::add 2.10% 345.215us 3.57% 585.271us 16.258us 127.810us 2.08% 127.810us 3.550us 36
4085
- aten::where 0.07% 10.792us 7.40% 1.214ms 101.132us 0.000us 0.00% 123.674us 10.306us 12
4086
- aten::nonzero_numpy 0.13% 21.688us 7.33% 1.203ms 100.233us 0.000us 0.00% 123.674us 10.306us 12
4087
- aten::index 2.22% 363.289us 3.85% 631.035us 26.293us 111.423us 1.82% 111.423us 4.643us 24
4088
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 101.762us 1.66% 101.762us 4.240us 24
4089
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 91.773us 1.50% 91.773us 1.055us 87
4090
- aten::clamp 1.29% 211.324us 2.19% 359.818us 14.992us 88.222us 1.44% 88.222us 3.676us 24
4091
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 88.222us 1.44% 88.222us 3.676us 24
4092
- aten::item 0.47% 77.138us 37.50% 6.150ms 85.417us 0.000us 0.00% 75.678us 1.051us 72
4093
- aten::_local_scalar_dense 1.90% 311.363us 37.03% 6.073ms 84.345us 75.678us 1.23% 75.678us 1.051us 72
4094
- aten::index_add_ 0.59% 96.073us 0.99% 162.304us 13.525us 70.526us 1.15% 70.526us 5.877us 12
4095
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 70.526us 1.15% 70.526us 5.877us 12
4096
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 66.368us 1.08% 66.368us 5.531us 12
4097
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4098
- Self CPU time total: 16.401ms
4099
- Self CUDA time total: 6.136ms
4100
 
4101
 
4102
 
@@ -4106,29 +4106,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S1024_E2
4106
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4107
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 12.623ms 150.27% 12.623ms 12.623ms 1
4110
- gpt_oss_experts 13.47% 1.791ms 99.96% 13.283ms 13.283ms 0.000us 0.00% 8.405ms 8.405ms 1
4111
- aten::matmul 0.18% 23.339us 3.36% 446.659us 37.222us 0.000us 0.00% 7.382ms 615.173us 12
4112
- aten::mm 1.99% 264.803us 3.19% 423.320us 35.277us 7.382ms 87.88% 7.382ms 615.173us 12
4113
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 4.494ms 53.50% 4.494ms 748.960us 6
4114
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 1.479ms 17.61% 1.479ms 493.131us 3
4115
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.402ms 16.69% 1.402ms 467.413us 3
4116
- aten::mul 1.17% 155.791us 2.03% 269.215us 11.217us 193.439us 2.30% 193.439us 8.060us 24
4117
- aten::add 1.34% 178.665us 2.34% 311.318us 17.295us 184.286us 2.19% 184.286us 10.238us 18
4118
- aten::index_add_ 0.37% 48.760us 0.64% 85.661us 14.277us 167.358us 1.99% 167.358us 27.893us 6
4119
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 167.358us 1.99% 167.358us 27.893us 6
4120
- aten::index 1.43% 189.705us 2.42% 321.187us 26.766us 146.945us 1.75% 146.945us 12.245us 12
4121
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 145.824us 1.74% 145.824us 12.152us 12
4122
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 116.832us 1.39% 116.832us 19.472us 6
4123
- aten::clamp 0.82% 108.995us 1.40% 185.495us 15.458us 109.284us 1.30% 109.284us 9.107us 12
4124
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 109.284us 1.30% 109.284us 9.107us 12
4125
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 103.135us 1.23% 103.135us 8.595us 12
4126
- aten::nonzero 1.83% 243.374us 5.76% 765.236us 85.026us 70.402us 0.84% 81.794us 9.088us 9
4127
- aten::where 0.04% 5.651us 4.63% 615.153us 102.525us 0.000us 0.00% 66.851us 11.142us 6
4128
- aten::nonzero_numpy 0.08% 11.009us 4.59% 609.502us 101.584us 0.000us 0.00% 66.851us 11.142us 6
4129
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4130
- Self CPU time total: 13.289ms
4131
- Self CUDA time total: 8.400ms
4132
 
4133
 
4134
 
@@ -4138,29 +4138,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B1_S1024_E4
4138
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4139
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4140
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4141
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 18.138ms 172.84% 18.138ms 18.138ms 1
4142
- gpt_oss_experts 12.76% 2.622ms 99.97% 20.540ms 20.540ms 0.000us 0.00% 10.500ms 10.500ms 1
4143
- aten::matmul 0.22% 44.749us 4.11% 844.232us 35.176us 0.000us 0.00% 9.224ms 384.346us 24
4144
- aten::mm 2.32% 476.088us 3.89% 799.483us 33.312us 9.224ms 87.90% 9.224ms 384.346us 24
4145
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 6.287ms 59.90% 6.287ms 349.259us 18
4146
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.925ms 27.87% 2.925ms 487.438us 6
4147
- aten::mul 1.51% 311.093us 2.62% 538.833us 11.226us 229.793us 2.19% 229.793us 4.787us 48
4148
- aten::add 1.68% 344.530us 2.88% 592.257us 16.452us 211.009us 2.01% 211.009us 5.861us 36
4149
- aten::index 1.75% 359.041us 3.02% 619.685us 25.820us 205.054us 1.95% 205.054us 8.544us 24
4150
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 164.639us 1.57% 164.639us 6.860us 24
4151
- aten::index_add_ 0.48% 97.780us 0.85% 174.953us 14.579us 157.631us 1.50% 157.631us 13.136us 12
4152
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 157.631us 1.50% 157.631us 13.136us 12
4153
- aten::nonzero 1.89% 388.553us 6.17% 1.268ms 84.506us 122.654us 1.17% 146.847us 9.790us 15
4154
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 145.663us 1.39% 145.663us 12.139us 12
4155
- aten::where 0.05% 10.471us 5.79% 1.190ms 99.134us 0.000us 0.00% 132.128us 11.011us 12
4156
- aten::nonzero_numpy 0.10% 21.340us 5.74% 1.179ms 98.262us 0.000us 0.00% 132.128us 11.011us 12
4157
- aten::clamp 1.02% 209.010us 1.74% 358.311us 14.930us 131.327us 1.25% 131.327us 5.472us 24
4158
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 131.327us 1.25% 131.327us 5.472us 24
4159
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 117.601us 1.12% 117.601us 4.900us 24
4160
- Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 108.253us 1.03% 108.253us 1.244us 87
4161
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4162
- Self CPU time total: 20.546ms
4163
- Self CUDA time total: 10.495ms
4164
 
4165
 
4166
 
@@ -4170,29 +4170,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S512_E2
4170
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4171
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4172
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4173
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 20.935ms 121.00% 20.935ms 20.935ms 1
4174
- gpt_oss_experts 7.61% 1.780ms 99.98% 23.376ms 23.376ms 0.000us 0.00% 17.312ms 17.312ms 1
4175
- aten::matmul 0.10% 23.122us 1.96% 458.772us 38.231us 0.000us 0.00% 14.468ms 1.206ms 12
4176
- aten::mm 1.15% 269.268us 1.86% 435.650us 36.304us 14.468ms 83.62% 14.468ms 1.206ms 12
4177
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 8.827ms 51.02% 8.827ms 1.471ms 6
4178
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.632ms 32.55% 5.632ms 938.689us 6
4179
- aten::add 0.79% 184.599us 1.36% 318.590us 17.699us 771.593us 4.46% 771.593us 42.866us 18
4180
- aten::mul 0.68% 158.205us 1.17% 272.787us 11.366us 648.706us 3.75% 648.706us 27.029us 24
4181
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 492.134us 2.84% 492.134us 41.011us 12
4182
- aten::index_add_ 0.22% 51.621us 0.39% 91.292us 15.215us 449.187us 2.60% 449.187us 74.864us 6
4183
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 449.187us 2.60% 449.187us 74.864us 6
4184
- aten::clamp 0.47% 109.062us 0.80% 186.384us 15.532us 328.069us 1.90% 328.069us 27.339us 12
4185
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 328.069us 1.90% 328.069us 27.339us 12
4186
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 298.432us 1.72% 298.432us 49.739us 6
4187
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 279.459us 1.62% 279.459us 46.576us 6
4188
- aten::index 0.79% 185.644us 1.37% 320.365us 26.697us 259.362us 1.50% 259.362us 21.614us 12
4189
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 252.002us 1.46% 252.002us 21.000us 12
4190
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 226.817us 1.31% 226.817us 37.803us 6
4191
- aten::sigmoid 0.16% 37.651us 0.31% 72.093us 12.016us 177.249us 1.02% 177.249us 29.542us 6
4192
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 177.249us 1.02% 177.249us 29.542us 6
4193
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4194
- Self CPU time total: 23.381ms
4195
- Self CUDA time total: 17.302ms
4196
 
4197
 
4198
 
@@ -4202,29 +4202,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S512_E4
4202
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4203
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4204
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4205
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 24.710ms 141.76% 24.710ms 24.710ms 1
4206
- gpt_oss_experts 10.14% 2.749ms 99.98% 27.106ms 27.106ms 0.000us 0.00% 17.441ms 17.441ms 1
4207
- aten::matmul 0.17% 45.968us 3.40% 922.464us 38.436us 0.000us 0.00% 15.230ms 634.586us 24
4208
- aten::mm 2.05% 556.479us 3.23% 876.496us 36.521us 15.230ms 87.37% 15.230ms 634.586us 24
4209
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 9.172ms 52.62% 9.172ms 764.334us 12
4210
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.147ms 18.05% 3.147ms 524.452us 6
4211
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.898ms 16.62% 2.898ms 482.943us 6
4212
- aten::add 1.29% 350.116us 2.26% 613.465us 17.041us 420.321us 2.41% 420.321us 11.676us 36
4213
- aten::mul 1.13% 307.419us 1.97% 533.015us 11.104us 413.571us 2.37% 413.571us 8.616us 48
4214
- aten::index_add_ 0.36% 98.853us 0.63% 169.455us 14.121us 380.323us 2.18% 380.323us 31.694us 12
4215
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 380.323us 2.18% 380.323us 31.694us 12
4216
- aten::index 1.34% 364.187us 2.36% 638.760us 26.615us 342.626us 1.97% 342.626us 14.276us 24
4217
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 337.185us 1.93% 337.185us 14.049us 24
4218
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 278.754us 1.60% 278.754us 23.230us 12
4219
- aten::clamp 0.81% 219.710us 1.37% 372.721us 15.530us 226.367us 1.30% 226.367us 9.432us 24
4220
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 226.367us 1.30% 226.367us 9.432us 24
4221
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 219.298us 1.26% 219.298us 9.137us 24
4222
- aten::nonzero 1.48% 402.204us 4.91% 1.331ms 88.732us 129.571us 0.74% 155.747us 10.383us 15
4223
- aten::where 0.04% 10.572us 4.67% 1.267ms 105.600us 0.000us 0.00% 139.970us 11.664us 12
4224
- aten::nonzero_numpy 0.08% 21.969us 4.64% 1.257ms 104.719us 0.000us 0.00% 139.970us 11.664us 12
4225
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4226
- Self CPU time total: 27.112ms
4227
- Self CUDA time total: 17.431ms
4228
 
4229
 
4230
 
@@ -4234,29 +4234,29 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S1024_E2
4234
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4235
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4236
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4237
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 40.438ms 109.96% 40.438ms 40.438ms 1
4238
- gpt_oss_experts 4.40% 1.882ms 99.82% 42.728ms 42.728ms 0.000us 0.00% 36.808ms 36.808ms 1
4239
- aten::matmul 0.05% 22.249us 1.02% 438.421us 36.535us 0.000us 0.00% 26.813ms 2.234ms 12
4240
- aten::mm 0.66% 281.965us 0.97% 416.172us 34.681us 26.813ms 72.91% 26.813ms 2.234ms 12
4241
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 26.809ms 72.90% 26.809ms 2.234ms 12
4242
- aten::mul 0.40% 169.436us 0.68% 291.368us 12.140us 2.973ms 8.09% 2.973ms 123.894us 24
4243
- aten::add 0.45% 194.095us 1.09% 466.694us 25.927us 2.399ms 6.52% 2.399ms 133.270us 18
4244
- aten::clamp 0.28% 118.373us 0.48% 205.484us 17.124us 2.385ms 6.49% 2.385ms 198.780us 12
4245
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 2.385ms 6.49% 2.385ms 198.780us 12
4246
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.983ms 5.39% 1.983ms 165.284us 12
4247
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.623ms 4.41% 1.623ms 135.241us 12
4248
- aten::index_add_ 0.12% 50.121us 0.21% 88.453us 14.742us 929.513us 2.53% 929.513us 154.919us 6
4249
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 929.513us 2.53% 929.513us 154.919us 6
4250
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 775.973us 2.11% 775.973us 129.329us 6
4251
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 743.622us 2.02% 743.622us 123.937us 6
4252
- aten::index 0.44% 190.163us 0.78% 332.417us 27.701us 705.798us 1.92% 705.798us 58.816us 12
4253
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 672.133us 1.83% 672.133us 112.022us 6
4254
- aten::sigmoid 0.10% 42.342us 0.17% 71.992us 11.999us 317.635us 0.86% 317.635us 52.939us 6
4255
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 317.635us 0.86% 317.635us 52.939us 6
4256
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 246.434us 0.67% 246.434us 41.072us 6
4257
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4258
- Self CPU time total: 42.805ms
4259
- Self CUDA time total: 36.776ms
4260
 
4261
 
4262
 
@@ -4266,40 +4266,40 @@ PROFILE TRACE: gpt_oss_experts | cuda_B4_S1024_E4
4266
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4267
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4268
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4269
- gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 40.917ms 118.34% 40.917ms 40.917ms 1
4270
- gpt_oss_experts 6.54% 2.832ms 99.99% 43.320ms 43.320ms 0.000us 0.00% 34.594ms 34.594ms 1
4271
- aten::matmul 0.11% 46.003us 2.16% 933.683us 38.903us 0.000us 0.00% 28.640ms 1.193ms 24
4272
- aten::mm 1.27% 551.595us 2.05% 887.680us 36.987us 28.640ms 82.83% 28.640ms 1.193ms 24
4273
- void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 20.238ms 58.53% 20.238ms 1.349ms 15
4274
- ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 8.385ms 24.25% 8.385ms 931.701us 9
4275
- aten::add 0.85% 367.713us 1.47% 637.625us 17.712us 1.485ms 4.30% 1.485ms 41.254us 36
4276
- aten::mul 0.73% 317.651us 1.28% 554.606us 11.554us 1.368ms 3.96% 1.368ms 28.495us 48
4277
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 932.164us 2.70% 932.164us 38.840us 24
4278
- aten::index_add_ 0.23% 99.030us 0.39% 170.492us 14.208us 912.225us 2.64% 912.225us 76.019us 12
4279
- void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 912.225us 2.64% 912.225us 76.019us 12
4280
- aten::clamp 0.52% 223.402us 0.90% 389.994us 16.250us 772.775us 2.24% 772.775us 32.199us 24
4281
- void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 772.775us 2.24% 772.775us 32.199us 24
4282
- aten::index 0.84% 365.911us 1.48% 641.837us 26.743us 652.128us 1.89% 652.128us 27.172us 24
4283
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 646.273us 1.87% 646.273us 53.856us 12
4284
- void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 582.113us 1.68% 582.113us 48.509us 12
4285
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 552.993us 1.60% 552.993us 46.083us 12
4286
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 519.810us 1.50% 519.810us 21.659us 24
4287
- aten::sigmoid 0.18% 79.593us 0.31% 135.883us 11.324us 361.471us 1.05% 361.471us 30.123us 12
4288
- void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 361.471us 1.05% 361.471us 30.123us 12
4289
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4290
- Self CPU time total: 43.326ms
4291
- Self CUDA time total: 34.575ms
4292
 
4293
 
4294
  impl wl p50(ms) ok
4295
- gpt_oss_experts cuda_B1_S1024_E2 3.85 True
4296
- gpt_oss_experts cuda_B1_S1024_E4 5.31 True
4297
- gpt_oss_experts cuda_B1_S512_E2 2.63 True
4298
- gpt_oss_experts cuda_B1_S512_E4 3.93 True
4299
- gpt_oss_experts cuda_B4_S1024_E2 13.24 True
4300
- gpt_oss_experts cuda_B4_S1024_E4 13.36 True
4301
- gpt_oss_experts cuda_B4_S512_E2 6.72 True
4302
- gpt_oss_experts cuda_B4_S512_E4 7.52 True
4303
  </pre></div>
4304
  <div class="uv-install-logs" id="uv-logs-benchmark">
4305
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
@@ -4308,12 +4308,12 @@ gpt_oss_experts cuda_B4_S512_E4 7.52 True
4308
  Updated https://github.com/huggingface/kernels.git (55b7c980e96bf5f747f0e4136be61c0b089ab76c)
4309
  Building kernels @ git+https://github.com/huggingface/kernels.git@55b7c980e96bf5f747f0e4136be61c0b089ab76c
4310
  Built kernels @ git+https://github.com/huggingface/kernels.git@55b7c980e96bf5f747f0e4136be61c0b089ab76c
4311
- Installed 14 packages in 3ms
4312
  </div>
4313
  </div>
4314
  <div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00&lt;?, ?it/s]
4315
- Fetching 6 files: 50%|█████ | 3/6 [00:00&lt;00:00, 6.07it/s]
4316
- Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 12.14it/s]</div>
4317
  <div class="cell-artifacts">
4318
  <h4>Artifacts:</h4>
4319
  <a href="artifacts/benchmark/openai_moe.jsonl" class="artifact" target="_blank">openai_moe.jsonl</a>
 
3888
  <span onclick="toggleOutput('nv')" style="cursor: pointer;">▼ output</span>
3889
  <span id="uv-indicator-nv" style="cursor: default; opacity: 0.3;">▶ uv-logs</span>
3890
  </span> |
3891
+ Cell: nv | 0.25s
3892
  | <button class="run-btn" onclick="runCell('nv')">▶ run</button>
3893
  <button class="copy-btn" onclick="copyCell('nv')">Copy</button>
3894
  <a href="cells/nv.py" target="_blank" class="raw-btn">Raw</a>
 
3905
  </div>
3906
  </div>
3907
  <div id="output-nv" class="cell-output">
3908
+ <div class="cell-stdout"><pre class="stdout-text">Fri Dec 19 23:00:37 2025
3909
  +-----------------------------------------------------------------------------------------+
3910
  | NVIDIA-SMI 580.105.08 Driver Version: 580.105.08 CUDA Version: 13.0 |
3911
  +-----------------------------------------+------------------------+----------------------+
 
3914
  | | | MIG M. |
3915
  |=========================================+========================+======================|
3916
  | 0 NVIDIA L40S On | 00000000:4D:00.0 Off | 0 |
3917
+ | N/A 40C P0 84W / 350W | 0MiB / 46068MiB | 60% Default |
3918
  | | | N/A |
3919
  +-----------------------------------------+------------------------+----------------------+
3920
 
 
3938
  <span onclick="toggleOutput('benchmark')" style="cursor: pointer;">▼ output</span>
3939
  <span id="uv-indicator-benchmark" onclick="toggleUvLogsFromHeader('benchmark')" style="cursor: pointer;">▶ uv-logs</span>
3940
  </span> |
3941
+ Cell: benchmark | 21.54s
3942
  | <button class="run-btn" onclick="runCell('benchmark')">▶ run</button>
3943
  <button class="copy-btn" onclick="copyCell('benchmark')">Copy</button>
3944
  <a href="cells/benchmark.py" target="_blank" class="raw-btn">Raw</a>
 
4042
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4043
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4044
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4045
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 10.236ms 195.72% 10.236ms 10.236ms 1
4046
+ gpt_oss_experts 16.81% 2.119ms 99.94% 12.602ms 12.602ms 0.000us 0.00% 5.233ms 5.233ms 1
4047
+ aten::matmul 0.21% 26.351us 3.80% 479.051us 39.921us 0.000us 0.00% 4.609ms 384.095us 12
4048
+ aten::mm 2.34% 295.677us 3.59% 452.700us 37.725us 4.609ms 88.13% 4.609ms 384.095us 12
4049
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.132ms 59.89% 3.132ms 348.055us 9
4050
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.470ms 28.11% 1.470ms 490.007us 3
4051
+ aten::mul 1.25% 158.075us 2.09% 263.508us 10.979us 109.535us 2.09% 109.535us 4.564us 24
4052
+ aten::add 1.50% 188.607us 3.77% 475.033us 26.391us 103.232us 1.97% 103.232us 5.735us 18
4053
+ aten::index 1.58% 199.165us 2.64% 332.439us 27.703us 88.193us 1.69% 88.193us 7.349us 12
4054
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 80.832us 1.55% 80.832us 6.736us 12
4055
+ aten::index_add_ 0.43% 54.021us 0.70% 88.353us 14.726us 79.361us 1.52% 79.361us 13.227us 6
4056
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 79.361us 1.52% 79.361us 13.227us 6
4057
+ aten::nonzero 2.14% 269.616us 6.31% 796.127us 88.459us 63.904us 1.22% 74.560us 8.284us 9
4058
+ aten::clamp 0.90% 113.849us 1.52% 191.573us 15.964us 63.523us 1.21% 63.523us 5.294us 12
4059
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 63.523us 1.21% 63.523us 5.294us 12
4060
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 60.767us 1.16% 60.767us 10.128us 6
4061
+ aten::where 0.06% 7.630us 5.01% 631.874us 105.312us 0.000us 0.00% 60.384us 10.064us 6
4062
+ aten::nonzero_numpy 0.10% 12.751us 4.95% 624.244us 104.041us 0.000us 0.00% 60.384us 10.064us 6
4063
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 56.992us 1.09% 56.992us 4.749us 12
4064
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 50.880us 0.97% 50.880us 1.131us 45
4065
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4066
+ Self CPU time total: 12.609ms
4067
+ Self CUDA time total: 5.230ms
4068
 
4069
 
4070
 
 
4074
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4075
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4076
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4077
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 14.418ms 232.27% 14.418ms 14.418ms 1
4078
+ gpt_oss_experts 16.77% 2.777ms 99.97% 16.548ms 16.548ms 0.000us 0.00% 6.210ms 6.210ms 1
4079
+ aten::matmul 0.29% 47.549us 4.87% 805.573us 33.566us 0.000us 0.00% 5.399ms 224.951us 24
4080
+ aten::mm 2.86% 473.570us 4.58% 758.024us 31.584us 5.399ms 86.98% 5.399ms 224.951us 24
4081
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.343ms 86.07% 5.343ms 222.609us 24
4082
+ aten::nonzero 2.46% 406.423us 7.79% 1.290ms 85.983us 112.737us 1.82% 135.233us 9.016us 15
4083
+ aten::mul 1.91% 315.499us 3.27% 541.644us 11.284us 131.458us 2.12% 131.458us 2.739us 48
4084
+ aten::add 2.09% 345.610us 3.58% 592.305us 16.453us 127.137us 2.05% 127.137us 3.532us 36
4085
+ aten::where 0.07% 11.421us 7.35% 1.217ms 101.380us 0.000us 0.00% 121.345us 10.112us 12
4086
+ aten::nonzero_numpy 0.14% 22.419us 7.28% 1.205ms 100.429us 0.000us 0.00% 121.345us 10.112us 12
4087
+ aten::index 2.37% 392.707us 3.98% 658.793us 27.450us 110.560us 1.78% 110.560us 4.607us 24
4088
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 101.089us 1.63% 101.089us 4.212us 24
4089
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 91.523us 1.47% 91.523us 1.052us 87
4090
+ aten::clamp 1.31% 216.727us 2.19% 362.649us 15.110us 87.299us 1.41% 87.299us 3.637us 24
4091
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 87.299us 1.41% 87.299us 3.637us 24
4092
+ aten::item 0.49% 80.385us 37.67% 6.235ms 86.604us 0.000us 0.00% 75.204us 1.044us 72
4093
+ aten::_local_scalar_dense 1.99% 329.728us 37.18% 6.155ms 85.487us 75.204us 1.21% 75.204us 1.044us 72
4094
+ aten::index_add_ 0.56% 93.084us 0.97% 160.623us 13.385us 71.618us 1.15% 71.618us 5.968us 12
4095
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 71.618us 1.15% 71.618us 5.968us 12
4096
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 66.656us 1.07% 66.656us 5.555us 12
4097
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4098
+ Self CPU time total: 16.554ms
4099
+ Self CUDA time total: 6.207ms
4100
 
4101
 
4102
 
 
4106
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4107
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4108
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4109
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 12.715ms 148.98% 12.715ms 12.715ms 1
4110
+ gpt_oss_experts 13.24% 1.769ms 99.96% 13.348ms 13.348ms 0.000us 0.00% 8.540ms 8.540ms 1
4111
+ aten::matmul 0.18% 23.619us 3.35% 447.210us 37.267us 0.000us 0.00% 7.511ms 625.895us 12
4112
+ aten::mm 1.99% 265.185us 3.17% 423.591us 35.299us 7.511ms 88.01% 7.511ms 625.895us 12
4113
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 4.572ms 53.58% 4.572ms 762.082us 6
4114
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 1.497ms 17.54% 1.497ms 498.892us 3
4115
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 1.435ms 16.81% 1.435ms 478.305us 3
4116
+ aten::mul 1.21% 162.011us 2.06% 274.994us 11.458us 197.600us 2.32% 197.600us 8.233us 24
4117
+ aten::add 1.32% 176.183us 2.25% 300.545us 16.697us 188.546us 2.21% 188.546us 10.475us 18
4118
+ aten::index_add_ 0.35% 46.949us 0.64% 86.050us 14.342us 164.416us 1.93% 164.416us 27.403us 6
4119
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 164.416us 1.93% 164.416us 27.403us 6
4120
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 149.442us 1.75% 149.442us 12.453us 12
4121
+ aten::index 1.39% 185.093us 2.39% 318.747us 26.562us 146.144us 1.71% 146.144us 12.179us 12
4122
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 116.287us 1.36% 116.287us 19.381us 6
4123
+ aten::clamp 0.82% 108.858us 1.40% 187.503us 15.625us 110.850us 1.30% 110.850us 9.238us 12
4124
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 110.850us 1.30% 110.850us 9.238us 12
4125
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 104.960us 1.23% 104.960us 8.747us 12
4126
+ aten::nonzero 1.82% 243.314us 5.65% 754.927us 83.881us 69.183us 0.81% 80.703us 8.967us 9
4127
+ aten::where 0.04% 5.842us 4.63% 617.944us 102.991us 0.000us 0.00% 66.080us 11.013us 6
4128
+ aten::nonzero_numpy 0.08% 11.348us 4.58% 612.102us 102.017us 0.000us 0.00% 66.080us 11.013us 6
4129
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4130
+ Self CPU time total: 13.354ms
4131
+ Self CUDA time total: 8.534ms
4132
 
4133
 
4134
 
 
4138
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4139
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4140
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4141
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 18.371ms 173.06% 18.371ms 18.371ms 1
4142
+ gpt_oss_experts 12.78% 2.670ms 99.97% 20.895ms 20.895ms 0.000us 0.00% 10.621ms 10.621ms 1
4143
+ aten::matmul 0.23% 47.482us 3.94% 823.658us 34.319us 0.000us 0.00% 9.337ms 389.038us 24
4144
+ aten::mm 2.27% 474.301us 3.71% 776.176us 32.341us 9.337ms 87.96% 9.337ms 389.038us 24
4145
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 6.375ms 60.06% 6.375ms 354.186us 18
4146
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.948ms 27.77% 2.948ms 491.399us 6
4147
+ aten::mul 1.63% 341.535us 2.71% 565.653us 11.784us 233.052us 2.20% 233.052us 4.855us 48
4148
+ aten::add 1.65% 343.966us 2.82% 589.773us 16.383us 214.333us 2.02% 214.333us 5.954us 36
4149
+ aten::index 1.71% 356.851us 2.95% 617.053us 25.711us 204.352us 1.93% 204.352us 8.515us 24
4150
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 167.774us 1.58% 167.774us 6.991us 24
4151
+ aten::index_add_ 0.45% 94.502us 0.77% 161.933us 13.494us 156.322us 1.47% 156.322us 13.027us 12
4152
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 156.322us 1.47% 156.322us 13.027us 12
4153
+ aten::nonzero 1.91% 398.170us 6.16% 1.287ms 85.805us 122.527us 1.15% 147.135us 9.809us 15
4154
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 146.240us 1.38% 146.240us 12.187us 12
4155
+ aten::clamp 1.04% 217.693us 1.76% 368.516us 15.355us 133.438us 1.26% 133.438us 5.560us 24
4156
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 133.438us 1.26% 133.438us 5.560us 24
4157
+ aten::where 0.05% 11.100us 5.81% 1.214ms 101.204us 0.000us 0.00% 132.577us 11.048us 12
4158
+ aten::nonzero_numpy 0.10% 21.341us 5.76% 1.203ms 100.279us 0.000us 0.00% 132.577us 11.048us 12
4159
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 119.358us 1.12% 119.358us 4.973us 24
4160
+ Memcpy DtoH (Device -&gt; Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 108.671us 1.02% 108.671us 1.249us 87
4161
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4162
+ Self CPU time total: 20.901ms
4163
+ Self CUDA time total: 10.615ms
4164
 
4165
 
4166
 
 
4170
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4171
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4172
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4173
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 21.147ms 120.13% 21.147ms 21.147ms 1
4174
+ gpt_oss_experts 7.48% 1.759ms 99.98% 23.501ms 23.501ms 0.000us 0.00% 17.613ms 17.613ms 1
4175
+ aten::matmul 0.10% 24.413us 1.93% 452.632us 37.719us 0.000us 0.00% 14.754ms 1.229ms 12
4176
+ aten::mm 1.14% 267.578us 1.82% 428.219us 35.685us 14.754ms 83.81% 14.754ms 1.229ms 12
4177
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 9.005ms 51.15% 9.005ms 1.501ms 6
4178
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 5.740ms 32.61% 5.740ms 956.646us 6
4179
+ aten::add 0.80% 187.171us 1.34% 315.717us 17.540us 774.145us 4.40% 774.145us 43.008us 18
4180
+ aten::mul 0.68% 160.882us 1.16% 272.615us 11.359us 660.967us 3.75% 660.967us 27.540us 24
4181
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 494.017us 2.81% 494.017us 41.168us 12
4182
+ aten::index_add_ 0.20% 46.930us 0.35% 82.651us 13.775us 446.818us 2.54% 446.818us 74.470us 6
4183
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 446.818us 2.54% 446.818us 74.470us 6
4184
+ aten::clamp 0.49% 114.212us 0.82% 193.704us 16.142us 330.081us 1.88% 330.081us 27.507us 12
4185
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 330.081us 1.88% 330.081us 27.507us 12
4186
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 303.524us 1.72% 303.524us 50.587us 6
4187
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 280.128us 1.59% 280.128us 46.688us 6
4188
+ aten::index 0.79% 185.142us 1.34% 314.927us 26.244us 260.002us 1.48% 260.002us 21.667us 12
4189
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 255.139us 1.45% 255.139us 21.262us 12
4190
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 227.361us 1.29% 227.361us 37.894us 6
4191
+ aten::sigmoid 0.17% 39.139us 0.29% 67.081us 11.180us 175.681us 1.00% 175.681us 29.280us 6
4192
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 175.681us 1.00% 175.681us 29.280us 6
4193
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4194
+ Self CPU time total: 23.507ms
4195
+ Self CUDA time total: 17.603ms
4196
 
4197
 
4198
 
 
4202
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4203
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4204
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4205
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 24.812ms 140.52% 24.812ms 24.812ms 1
4206
+ gpt_oss_experts 10.20% 2.768ms 99.98% 27.139ms 27.139ms 0.000us 0.00% 17.668ms 17.668ms 1
4207
+ aten::matmul 0.17% 47.070us 3.25% 881.530us 36.730us 0.000us 0.00% 15.436ms 643.168us 24
4208
+ aten::mm 1.94% 525.958us 3.07% 834.460us 34.769us 15.436ms 87.42% 15.436ms 643.168us 24
4209
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 9.298ms 52.66% 9.298ms 774.816us 12
4210
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 3.185ms 18.04% 3.185ms 530.803us 6
4211
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_128x64_8... 0.00% 0.000us 0.00% 0.000us 0.000us 2.939ms 16.65% 2.939ms 489.897us 6
4212
+ aten::add 1.32% 358.751us 2.25% 610.989us 16.972us 429.537us 2.43% 429.537us 11.932us 36
4213
+ aten::mul 1.17% 318.045us 2.01% 546.157us 11.378us 419.555us 2.38% 419.555us 8.741us 48
4214
+ aten::index_add_ 0.35% 93.791us 0.61% 165.384us 13.782us 375.712us 2.13% 375.712us 31.309us 12
4215
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 375.712us 2.13% 375.712us 31.309us 12
4216
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 344.672us 1.95% 344.672us 14.361us 24
4217
+ aten::index 1.36% 368.555us 2.35% 637.581us 26.566us 343.779us 1.95% 343.779us 14.324us 24
4218
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 279.524us 1.58% 279.524us 23.294us 12
4219
+ aten::clamp 0.81% 220.839us 1.38% 373.627us 15.568us 232.100us 1.31% 232.100us 9.671us 24
4220
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 232.100us 1.31% 232.100us 9.671us 24
4221
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 222.273us 1.26% 222.273us 9.261us 24
4222
+ aten::nonzero 1.49% 404.133us 4.81% 1.304ms 86.953us 129.285us 0.73% 155.591us 10.373us 15
4223
+ aten::where 0.04% 11.801us 4.54% 1.232ms 102.652us 0.000us 0.00% 140.134us 11.678us 12
4224
+ aten::nonzero_numpy 0.08% 22.919us 4.49% 1.220ms 101.669us 0.000us 0.00% 140.134us 11.678us 12
4225
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4226
+ Self CPU time total: 27.144ms
4227
+ Self CUDA time total: 17.658ms
4228
 
4229
 
4230
 
 
4234
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4235
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4236
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4237
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 40.716ms 109.44% 40.716ms 40.716ms 1
4238
+ gpt_oss_experts 4.15% 1.782ms 99.82% 42.848ms 42.848ms 0.000us 0.00% 37.235ms 37.235ms 1
4239
+ aten::matmul 0.05% 22.008us 1.00% 427.588us 35.632us 0.000us 0.00% 27.249ms 2.271ms 12
4240
+ aten::mm 0.64% 276.436us 0.94% 405.580us 33.798us 27.249ms 73.24% 27.249ms 2.271ms 12
4241
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 27.245ms 73.23% 27.245ms 2.270ms 12
4242
+ aten::mul 0.38% 162.893us 0.65% 277.866us 11.578us 2.967ms 7.97% 2.967ms 123.619us 24
4243
+ aten::add 0.45% 194.205us 1.07% 458.802us 25.489us 2.398ms 6.45% 2.398ms 133.242us 18
4244
+ aten::clamp 0.26% 112.402us 0.45% 191.453us 15.954us 2.384ms 6.41% 2.384ms 198.708us 12
4245
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 2.384ms 6.41% 2.384ms 198.708us 12
4246
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 1.985ms 5.34% 1.985ms 165.412us 12
4247
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 1.626ms 4.37% 1.626ms 135.484us 12
4248
+ aten::index_add_ 0.11% 46.550us 0.19% 83.331us 13.889us 923.493us 2.48% 923.493us 153.916us 6
4249
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 923.493us 2.48% 923.493us 153.916us 6
4250
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 772.550us 2.08% 772.550us 128.758us 6
4251
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 736.421us 1.98% 736.421us 122.737us 6
4252
+ aten::index 0.43% 184.050us 0.73% 314.765us 26.230us 705.700us 1.90% 705.700us 58.808us 12
4253
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 672.068us 1.81% 672.068us 112.011us 6
4254
+ aten::sigmoid 0.09% 40.702us 0.16% 68.501us 11.417us 324.705us 0.87% 324.705us 54.117us 6
4255
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 324.705us 0.87% 324.705us 54.117us 6
4256
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 245.504us 0.66% 245.504us 40.917us 6
4257
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4258
+ Self CPU time total: 42.926ms
4259
+ Self CUDA time total: 37.203ms
4260
 
4261
 
4262
 
 
4266
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4267
  Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
4268
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4269
+ gpt_oss_experts 0.00% 0.000us 0.00% 0.000us 0.000us 41.326ms 117.97% 41.326ms 41.326ms 1
4270
+ gpt_oss_experts 6.48% 2.843ms 99.99% 43.865ms 43.865ms 0.000us 0.00% 35.050ms 35.050ms 1
4271
+ aten::matmul 0.11% 47.091us 2.05% 900.896us 37.537us 0.000us 0.00% 29.086ms 1.212ms 24
4272
+ aten::mm 1.22% 537.124us 1.95% 853.805us 35.575us 29.086ms 83.03% 29.086ms 1.212ms 24
4273
+ void cutlass::Kernel2&lt;cutlass_80_simt_sgemm_256x128_... 0.00% 0.000us 0.00% 0.000us 0.000us 20.524ms 58.59% 20.524ms 1.368ms 15
4274
+ ampere_sgemm_128x64_nn 0.00% 0.000us 0.00% 0.000us 0.000us 8.546ms 24.39% 8.546ms 949.503us 9
4275
+ aten::add 0.83% 362.842us 1.41% 616.516us 17.125us 1.481ms 4.23% 1.481ms 41.132us 36
4276
+ aten::mul 0.72% 316.599us 1.22% 535.905us 11.165us 1.379ms 3.94% 1.379ms 28.736us 48
4277
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 928.582us 2.65% 928.582us 38.691us 24
4278
+ aten::index_add_ 0.22% 95.553us 0.38% 168.433us 14.036us 914.346us 2.61% 914.346us 76.195us 12
4279
+ void at::native::indexFuncLargeIndex&lt;float, long, un... 0.00% 0.000us 0.00% 0.000us 0.000us 914.346us 2.61% 914.346us 76.195us 12
4280
+ aten::clamp 0.51% 224.207us 0.87% 380.890us 15.870us 772.996us 2.21% 772.996us 32.208us 24
4281
+ void at::native::elementwise_kernel&lt;128, 2, at::nati... 0.00% 0.000us 0.00% 0.000us 0.000us 772.996us 2.21% 772.996us 32.208us 24
4282
+ aten::index 0.86% 378.436us 1.47% 642.801us 26.783us 657.670us 1.88% 657.670us 27.403us 24
4283
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 653.092us 1.86% 653.092us 54.424us 12
4284
+ void at::native::vectorized_gather_kernel&lt;16, long&gt;(... 0.00% 0.000us 0.00% 0.000us 0.000us 586.630us 1.67% 586.630us 48.886us 12
4285
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 552.162us 1.58% 552.162us 46.014us 12
4286
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 522.306us 1.49% 522.306us 21.763us 24
4287
+ aten::sigmoid 0.20% 86.392us 0.33% 145.153us 12.096us 354.306us 1.01% 354.306us 29.525us 12
4288
+ void at::native::vectorized_elementwise_kernel&lt;4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 354.306us 1.01% 354.306us 29.525us 12
4289
  ------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
4290
+ Self CPU time total: 43.870ms
4291
+ Self CUDA time total: 35.030ms
4292
 
4293
 
4294
  impl wl p50(ms) ok
4295
+ gpt_oss_experts cuda_B1_S1024_E2 3.87 True
4296
+ gpt_oss_experts cuda_B1_S1024_E4 5.34 True
4297
+ gpt_oss_experts cuda_B1_S512_E2 2.66 True
4298
+ gpt_oss_experts cuda_B1_S512_E4 3.95 True
4299
+ gpt_oss_experts cuda_B4_S1024_E2 13.39 True
4300
+ gpt_oss_experts cuda_B4_S1024_E4 13.41 True
4301
+ gpt_oss_experts cuda_B4_S512_E2 6.80 True
4302
+ gpt_oss_experts cuda_B4_S512_E4 7.53 True
4303
  </pre></div>
4304
  <div class="uv-install-logs" id="uv-logs-benchmark">
4305
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
 
4308
  Updated https://github.com/huggingface/kernels.git (55b7c980e96bf5f747f0e4136be61c0b089ab76c)
4309
  Building kernels @ git+https://github.com/huggingface/kernels.git@55b7c980e96bf5f747f0e4136be61c0b089ab76c
4310
  Built kernels @ git+https://github.com/huggingface/kernels.git@55b7c980e96bf5f747f0e4136be61c0b089ab76c
4311
+ Installed 14 packages in 4ms
4312
  </div>
4313
  </div>
4314
  <div class="cell-stderr">Fetching 6 files: 0%| | 0/6 [00:00&lt;?, ?it/s]
4315
+ Fetching 6 files: 50%|█████ | 3/6 [00:00&lt;00:00, 5.85it/s]
4316
+ Fetching 6 files: 100%|██████████| 6/6 [00:00&lt;00:00, 11.70it/s]</div>
4317
  <div class="cell-artifacts">
4318
  <h4>Artifacts:</h4>
4319
  <a href="artifacts/benchmark/openai_moe.jsonl" class="artifact" target="_blank">openai_moe.jsonl</a>
openai_moe/results/artifacts/combine/latency.svg CHANGED

Git LFS Details

  • SHA256: 3090485b23d0740dc54ec975ab4d53494c6243ac5b87df898966ffdc9bc67256
  • Pointer size: 130 Bytes
  • Size of remote file: 20.3 kB

Git LFS Details

  • SHA256: 02555fc2deb5b3ebf32c8e5fb2da2aa4a52c2220d481a1dd86dbee6b6edcbda6
  • Pointer size: 130 Bytes
  • Size of remote file: 21.9 kB
openai_moe/results/combined_results.html CHANGED
@@ -3889,7 +3889,7 @@ body[data-tool="eraser"] .main-content {
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
- <dc:date>2025-12-19T19:55:39.293722</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
@@ -3908,294 +3908,320 @@ body[data-tool="eraser"] .main-content {
3908
  </g>
3909
  <g id="axes--1" class="axes">
3910
  <g id="patch_2">
3911
- <path d="M 57.17 468.317269 L 845.766818 468.317269 L 845.766818 26.88 L 57.17 26.88 L 57.17 468.317269 z " style="fill: none" />
3912
  </g>
3913
  <g id="matplotlib.axis_1">
3914
  <g id="xtick_1">
3915
  <g id="grid-x--1" class="grid grid-x">
3916
- <path d="M 93.01531 468.317269 L 93.01531 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3917
  </g>
3918
  <g id="line2d_1">
3919
  <defs>
3920
  <path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
3921
  </defs>
3922
  <g>
3923
- <use ns4:href="#mafb3703e5b" x="93.01531" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3924
  </g>
3925
  </g>
3926
  <g id="text_1">
3927
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(62.817431 544.791615) rotate(-45)">cuda_B1_S512_E2</text>
3928
  </g>
3929
  </g>
3930
  <g id="xtick_2">
3931
  <g id="grid-x--2" class="grid grid-x">
3932
- <path d="M 195.430481 468.317269 L 195.430481 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3933
  </g>
3934
  <g id="line2d_2">
3935
  <g>
3936
- <use ns4:href="#mafb3703e5b" x="195.430481" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3937
  </g>
3938
  </g>
3939
  <g id="text_2">
3940
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(165.232602 544.791615) rotate(-45)">cuda_B1_S512_E4</text>
3941
  </g>
3942
  </g>
3943
  <g id="xtick_3">
3944
  <g id="grid-x--3" class="grid grid-x">
3945
- <path d="M 297.845652 468.317269 L 297.845652 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3946
  </g>
3947
  <g id="line2d_3">
3948
  <g>
3949
- <use ns4:href="#mafb3703e5b" x="297.845652" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3950
  </g>
3951
  </g>
3952
  <g id="text_3">
3953
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(265.39829 549.290582) rotate(-45)">cuda_B1_S1024_E2</text>
3954
  </g>
3955
  </g>
3956
  <g id="xtick_4">
3957
  <g id="grid-x--4" class="grid grid-x">
3958
- <path d="M 400.260823 468.317269 L 400.260823 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3959
  </g>
3960
  <g id="line2d_4">
3961
  <g>
3962
- <use ns4:href="#mafb3703e5b" x="400.260823" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3963
  </g>
3964
  </g>
3965
  <g id="text_4">
3966
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(367.813461 549.290582) rotate(-45)">cuda_B1_S1024_E4</text>
3967
  </g>
3968
  </g>
3969
  <g id="xtick_5">
3970
  <g id="grid-x--5" class="grid grid-x">
3971
- <path d="M 502.675995 468.317269 L 502.675995 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3972
  </g>
3973
  <g id="line2d_5">
3974
  <g>
3975
- <use ns4:href="#mafb3703e5b" x="502.675995" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3976
  </g>
3977
  </g>
3978
  <g id="text_5">
3979
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(472.478116 544.791615) rotate(-45)">cuda_B4_S512_E2</text>
3980
  </g>
3981
  </g>
3982
  <g id="xtick_6">
3983
  <g id="grid-x--6" class="grid grid-x">
3984
- <path d="M 605.091166 468.317269 L 605.091166 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3985
  </g>
3986
  <g id="line2d_6">
3987
  <g>
3988
- <use ns4:href="#mafb3703e5b" x="605.091166" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3989
  </g>
3990
  </g>
3991
  <g id="text_6">
3992
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(574.893287 544.791615) rotate(-45)">cuda_B4_S512_E4</text>
3993
  </g>
3994
  </g>
3995
  <g id="xtick_7">
3996
  <g id="grid-x--7" class="grid grid-x">
3997
- <path d="M 707.506337 468.317269 L 707.506337 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3998
  </g>
3999
  <g id="line2d_7">
4000
  <g>
4001
- <use ns4:href="#mafb3703e5b" x="707.506337" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4002
  </g>
4003
  </g>
4004
  <g id="text_7">
4005
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(675.058975 549.290582) rotate(-45)">cuda_B4_S1024_E2</text>
4006
  </g>
4007
  </g>
4008
  <g id="xtick_8">
4009
  <g id="grid-x--8" class="grid grid-x">
4010
- <path d="M 809.921508 468.317269 L 809.921508 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4011
  </g>
4012
  <g id="line2d_8">
4013
  <g>
4014
- <use ns4:href="#mafb3703e5b" x="809.921508" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4015
  </g>
4016
  </g>
4017
  <g id="text_8">
4018
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(777.474146 549.290582) rotate(-45)">cuda_B4_S1024_E4</text>
4019
  </g>
4020
  </g>
4021
  <g id="label--x" class="xlabel">
4022
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.468409" y="562.556245" transform="rotate(-0 451.468409 562.556245)">Workload</text>
4023
  </g>
4024
  </g>
4025
  <g id="matplotlib.axis_2">
4026
  <g id="ytick_1">
4027
  <g id="grid-y--2" class="grid grid-y">
4028
- <path d="M 57.17 448.894453 L 845.766818 448.894453 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4029
  </g>
4030
  <g id="line2d_9">
4031
  <defs>
4032
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4033
  </defs>
4034
  <g>
4035
- <use ns4:href="#m0fca2865ba" x="57.17" y="448.894453" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="452.693672" transform="rotate(-0 50.17 452.693672)">0</text>
4040
  </g>
4041
  </g>
4042
  <g id="ytick_2">
4043
  <g id="grid-y--3" class="grid grid-y">
4044
- <path d="M 57.17 387.738866 L 845.766818 387.738866 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4045
  </g>
4046
  <g id="line2d_10">
4047
  <g>
4048
- <use ns4:href="#m0fca2865ba" x="57.17" y="387.738866" style="stroke: #000000; stroke-width: 0.8" />
4049
  </g>
4050
  </g>
4051
  <g id="text_10">
4052
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="391.538085" transform="rotate(-0 50.17 391.538085)">250</text>
4053
  </g>
4054
  </g>
4055
  <g id="ytick_3">
4056
  <g id="grid-y--4" class="grid grid-y">
4057
- <path d="M 57.17 326.583279 L 845.766818 326.583279 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4058
  </g>
4059
  <g id="line2d_11">
4060
  <g>
4061
- <use ns4:href="#m0fca2865ba" x="57.17" y="326.583279" style="stroke: #000000; stroke-width: 0.8" />
4062
  </g>
4063
  </g>
4064
  <g id="text_11">
4065
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="330.382498" transform="rotate(-0 50.17 330.382498)">500</text>
4066
  </g>
4067
  </g>
4068
  <g id="ytick_4">
4069
  <g id="grid-y--5" class="grid grid-y">
4070
- <path d="M 57.17 265.427692 L 845.766818 265.427692 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4071
  </g>
4072
  <g id="line2d_12">
4073
  <g>
4074
- <use ns4:href="#m0fca2865ba" x="57.17" y="265.427692" style="stroke: #000000; stroke-width: 0.8" />
4075
  </g>
4076
  </g>
4077
  <g id="text_12">
4078
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="269.226911" transform="rotate(-0 50.17 269.226911)">750</text>
4079
  </g>
4080
  </g>
4081
  <g id="ytick_5">
4082
  <g id="grid-y--6" class="grid grid-y">
4083
- <path d="M 57.17 204.272105 L 845.766818 204.272105 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4084
  </g>
4085
  <g id="line2d_13">
4086
  <g>
4087
- <use ns4:href="#m0fca2865ba" x="57.17" y="204.272105" style="stroke: #000000; stroke-width: 0.8" />
4088
  </g>
4089
  </g>
4090
  <g id="text_13">
4091
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="208.071324" transform="rotate(-0 50.17 208.071324)">1000</text>
4092
  </g>
4093
  </g>
4094
  <g id="ytick_6">
4095
  <g id="grid-y--7" class="grid grid-y">
4096
- <path d="M 57.17 143.116518 L 845.766818 143.116518 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4097
  </g>
4098
  <g id="line2d_14">
4099
  <g>
4100
- <use ns4:href="#m0fca2865ba" x="57.17" y="143.116518" style="stroke: #000000; stroke-width: 0.8" />
4101
  </g>
4102
  </g>
4103
  <g id="text_14">
4104
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="146.915736" transform="rotate(-0 50.17 146.915736)">1250</text>
4105
  </g>
4106
  </g>
4107
  <g id="ytick_7">
4108
  <g id="grid-y--8" class="grid grid-y">
4109
- <path d="M 57.17 81.960931 L 845.766818 81.960931 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4110
  </g>
4111
  <g id="line2d_15">
4112
  <g>
4113
- <use ns4:href="#m0fca2865ba" x="57.17" y="81.960931" style="stroke: #000000; stroke-width: 0.8" />
4114
  </g>
4115
  </g>
4116
  <g id="text_15">
4117
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="85.760149" transform="rotate(-0 50.17 85.760149)">1500</text>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4118
  </g>
4119
  </g>
4120
  <g id="label--y" class="ylabel">
4121
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.640312" y="247.598635" transform="rotate(-90 18.640312 247.598635)">Latency P50 (ms)</text>
4122
  </g>
4123
  </g>
4124
  <g id="series--binned-torch" class="series">
4125
- <path d="M 93.01531 410.178286 L 195.430481 397.766778 L 297.845652 356.453725 L 400.260823 348.866756 L 502.675995 260.623561 L 605.091166 249.051311 L 707.506337 77.923227 L 809.921508 46.94533 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4126
  <defs>
4127
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4128
  </defs>
4129
- <g clip-path="url(#p5307ca50d8)">
4130
- <use ns4:href="#md7efaf3aec" x="93.01531" y="410.178286" style="fill: #1f77b4; stroke: #1f77b4" />
4131
- <use ns4:href="#md7efaf3aec" x="195.430481" y="397.766778" style="fill: #1f77b4; stroke: #1f77b4" />
4132
- <use ns4:href="#md7efaf3aec" x="297.845652" y="356.453725" style="fill: #1f77b4; stroke: #1f77b4" />
4133
- <use ns4:href="#md7efaf3aec" x="400.260823" y="348.866756" style="fill: #1f77b4; stroke: #1f77b4" />
4134
- <use ns4:href="#md7efaf3aec" x="502.675995" y="260.623561" style="fill: #1f77b4; stroke: #1f77b4" />
4135
- <use ns4:href="#md7efaf3aec" x="605.091166" y="249.051311" style="fill: #1f77b4; stroke: #1f77b4" />
4136
- <use ns4:href="#md7efaf3aec" x="707.506337" y="77.923227" style="fill: #1f77b4; stroke: #1f77b4" />
4137
- <use ns4:href="#md7efaf3aec" x="809.921508" y="46.94533" style="fill: #1f77b4; stroke: #1f77b4" />
4138
  </g>
4139
  </g>
4140
  <g id="series--gpt-oss-experts" class="series">
4141
- <path d="M 93.01531 448.251939 L 195.430481 447.932519 L 297.845652 447.952742 L 400.260823 447.595994 L 502.675995 447.251251 L 605.091166 447.055342 L 707.506337 445.656252 L 809.921508 445.625657 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4142
  <defs>
4143
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4144
  </defs>
4145
- <g clip-path="url(#p5307ca50d8)">
4146
- <use ns4:href="#m9b8c54d372" x="93.01531" y="448.251939" style="fill: #ff7f0e; stroke: #ff7f0e" />
4147
- <use ns4:href="#m9b8c54d372" x="195.430481" y="447.932519" style="fill: #ff7f0e; stroke: #ff7f0e" />
4148
- <use ns4:href="#m9b8c54d372" x="297.845652" y="447.952742" style="fill: #ff7f0e; stroke: #ff7f0e" />
4149
- <use ns4:href="#m9b8c54d372" x="400.260823" y="447.595994" style="fill: #ff7f0e; stroke: #ff7f0e" />
4150
- <use ns4:href="#m9b8c54d372" x="502.675995" y="447.251251" style="fill: #ff7f0e; stroke: #ff7f0e" />
4151
- <use ns4:href="#m9b8c54d372" x="605.091166" y="447.055342" style="fill: #ff7f0e; stroke: #ff7f0e" />
4152
- <use ns4:href="#m9b8c54d372" x="707.506337" y="445.656252" style="fill: #ff7f0e; stroke: #ff7f0e" />
4153
- <use ns4:href="#m9b8c54d372" x="809.921508" y="445.625657" style="fill: #ff7f0e; stroke: #ff7f0e" />
4154
  </g>
4155
  </g>
4156
  <g id="patch_3">
4157
- <path d="M 57.17 468.317269 L 57.17 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4158
  </g>
4159
  <g id="patch_4">
4160
  <path d="M 845.766818 468.317269 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4161
  </g>
4162
  <g id="patch_5">
4163
- <path d="M 57.17 468.317269 L 845.766818 468.317269 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4164
  </g>
4165
  <g id="patch_6">
4166
- <path d="M 57.17 26.88 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4167
  </g>
4168
- <g id="text_16">
4169
- <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.468409" y="20.88" transform="rotate(-0 451.468409 20.88)">Attention Implementation Latency</text>
4170
  </g>
4171
  <g id="legend" class="legend">
4172
  <g id="patch_7">
4173
- <path d="M 64.17 64.7925 L 176.96375 64.7925 Q 178.96375 64.7925 178.96375 62.7925 L 178.96375 33.88 Q 178.96375 31.88 176.96375 31.88 L 64.17 31.88 Q 62.17 31.88 62.17 33.88 L 62.17 62.7925 Q 62.17 64.7925 64.17 64.7925 L 64.17 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4174
  </g>
4175
- <g id="line2d_16">
4176
- <path d="M 66.17 39.978438 L 76.17 39.978438 L 86.17 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4177
  <g>
4178
- <use ns4:href="#md7efaf3aec" x="76.17" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4179
  </g>
4180
  </g>
4181
  <g id="legend-label--binned-torch" class="legend">
4182
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.17" y="43.478438" transform="rotate(-0 94.17 43.478438)">binned_torch</text>
4183
  </g>
4184
- <g id="line2d_17">
4185
- <path d="M 66.17 54.934687 L 76.17 54.934687 L 86.17 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4186
  <g>
4187
- <use ns4:href="#m9b8c54d372" x="76.17" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4188
  </g>
4189
  </g>
4190
  <g id="legend-label--gpt-oss-experts" class="legend">
4191
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.17" y="58.434687" transform="rotate(-0 94.17 58.434687)">gpt_oss_experts</text>
4192
  </g>
4193
  </g>
4194
  </g>
4195
  </g>
4196
  <defs>
4197
- <clipPath id="p5307ca50d8">
4198
- <rect x="57.17" y="26.88" width="788.596818" height="441.437269" />
4199
  </clipPath>
4200
  </defs>
4201
  </svg>
@@ -4208,7 +4234,7 @@ body[data-tool="eraser"] .main-content {
4208
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4209
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4210
  </span> |
4211
- Cell: combine | 4.43s
4212
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4213
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4214
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
@@ -4297,22 +4323,22 @@ Summary: 2 found, 0 skipped, 0 missing
4297
  COMBINED BENCHMARK SUMMARY
4298
 
4299
  impl wl p50(ms) ok
4300
- binned_torch cuda_B1_S1024_E2 377.89 True
4301
- binned_torch cuda_B1_S1024_E4 408.91 True
4302
- binned_torch cuda_B1_S512_E2 158.27 True
4303
- binned_torch cuda_B1_S512_E4 209.01 True
4304
- binned_torch cuda_B4_S1024_E2 1516.51 True
4305
- binned_torch cuda_B4_S1024_E4 1643.14 True
4306
- binned_torch cuda_B4_S512_E2 769.64 True
4307
- binned_torch cuda_B4_S512_E4 816.95 True
4308
- gpt_oss_experts cuda_B1_S1024_E2 3.85 True
4309
- gpt_oss_experts cuda_B1_S1024_E4 5.31 True
4310
- gpt_oss_experts cuda_B1_S512_E2 2.63 True
4311
- gpt_oss_experts cuda_B1_S512_E4 3.93 True
4312
- gpt_oss_experts cuda_B4_S1024_E2 13.24 True
4313
- gpt_oss_experts cuda_B4_S1024_E4 13.36 True
4314
- gpt_oss_experts cuda_B4_S512_E2 6.72 True
4315
- gpt_oss_experts cuda_B4_S512_E4 7.52 True
4316
 
4317
  GENERATING COMBINED VISUALIZATION
4318
 
@@ -4332,7 +4358,7 @@ Implementations included:
4332
  <div class="uv-install-logs" id="uv-logs-combine">
4333
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4334
  <div class="uv-logs-content" style="display: none;">
4335
- Installed 37 packages in 205ms
4336
  </div>
4337
  </div>
4338
  <div class="cell-artifacts">
@@ -4345,7 +4371,7 @@ Installed 37 packages in 205ms
4345
  <rdf:RDF>
4346
  <ns2:Work>
4347
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4348
- <dc:date>2025-12-19T19:55:39.293722</dc:date>
4349
  <dc:format>image/svg+xml</dc:format>
4350
  <dc:creator>
4351
  <ns2:Agent>
@@ -4364,294 +4390,320 @@ Installed 37 packages in 205ms
4364
  </g>
4365
  <g id="axes--1" class="axes">
4366
  <g id="patch_2">
4367
- <path d="M 57.17 468.317269 L 845.766818 468.317269 L 845.766818 26.88 L 57.17 26.88 L 57.17 468.317269 z " style="fill: none" />
4368
  </g>
4369
  <g id="matplotlib.axis_1">
4370
  <g id="xtick_1">
4371
  <g id="grid-x--1" class="grid grid-x">
4372
- <path d="M 93.01531 468.317269 L 93.01531 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4373
  </g>
4374
  <g id="line2d_1">
4375
  <defs>
4376
  <path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
4377
  </defs>
4378
  <g>
4379
- <use ns4:href="#mafb3703e5b" x="93.01531" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4380
  </g>
4381
  </g>
4382
  <g id="text_1">
4383
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(62.817431 544.791615) rotate(-45)">cuda_B1_S512_E2</text>
4384
  </g>
4385
  </g>
4386
  <g id="xtick_2">
4387
  <g id="grid-x--2" class="grid grid-x">
4388
- <path d="M 195.430481 468.317269 L 195.430481 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4389
  </g>
4390
  <g id="line2d_2">
4391
  <g>
4392
- <use ns4:href="#mafb3703e5b" x="195.430481" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4393
  </g>
4394
  </g>
4395
  <g id="text_2">
4396
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(165.232602 544.791615) rotate(-45)">cuda_B1_S512_E4</text>
4397
  </g>
4398
  </g>
4399
  <g id="xtick_3">
4400
  <g id="grid-x--3" class="grid grid-x">
4401
- <path d="M 297.845652 468.317269 L 297.845652 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4402
  </g>
4403
  <g id="line2d_3">
4404
  <g>
4405
- <use ns4:href="#mafb3703e5b" x="297.845652" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4406
  </g>
4407
  </g>
4408
  <g id="text_3">
4409
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(265.39829 549.290582) rotate(-45)">cuda_B1_S1024_E2</text>
4410
  </g>
4411
  </g>
4412
  <g id="xtick_4">
4413
  <g id="grid-x--4" class="grid grid-x">
4414
- <path d="M 400.260823 468.317269 L 400.260823 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4415
  </g>
4416
  <g id="line2d_4">
4417
  <g>
4418
- <use ns4:href="#mafb3703e5b" x="400.260823" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4419
  </g>
4420
  </g>
4421
  <g id="text_4">
4422
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(367.813461 549.290582) rotate(-45)">cuda_B1_S1024_E4</text>
4423
  </g>
4424
  </g>
4425
  <g id="xtick_5">
4426
  <g id="grid-x--5" class="grid grid-x">
4427
- <path d="M 502.675995 468.317269 L 502.675995 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4428
  </g>
4429
  <g id="line2d_5">
4430
  <g>
4431
- <use ns4:href="#mafb3703e5b" x="502.675995" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4432
  </g>
4433
  </g>
4434
  <g id="text_5">
4435
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(472.478116 544.791615) rotate(-45)">cuda_B4_S512_E2</text>
4436
  </g>
4437
  </g>
4438
  <g id="xtick_6">
4439
  <g id="grid-x--6" class="grid grid-x">
4440
- <path d="M 605.091166 468.317269 L 605.091166 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4441
  </g>
4442
  <g id="line2d_6">
4443
  <g>
4444
- <use ns4:href="#mafb3703e5b" x="605.091166" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4445
  </g>
4446
  </g>
4447
  <g id="text_6">
4448
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(574.893287 544.791615) rotate(-45)">cuda_B4_S512_E4</text>
4449
  </g>
4450
  </g>
4451
  <g id="xtick_7">
4452
  <g id="grid-x--7" class="grid grid-x">
4453
- <path d="M 707.506337 468.317269 L 707.506337 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4454
  </g>
4455
  <g id="line2d_7">
4456
  <g>
4457
- <use ns4:href="#mafb3703e5b" x="707.506337" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4458
  </g>
4459
  </g>
4460
  <g id="text_7">
4461
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(675.058975 549.290582) rotate(-45)">cuda_B4_S1024_E2</text>
4462
  </g>
4463
  </g>
4464
  <g id="xtick_8">
4465
  <g id="grid-x--8" class="grid grid-x">
4466
- <path d="M 809.921508 468.317269 L 809.921508 26.88 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4467
  </g>
4468
  <g id="line2d_8">
4469
  <g>
4470
- <use ns4:href="#mafb3703e5b" x="809.921508" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4471
  </g>
4472
  </g>
4473
  <g id="text_8">
4474
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(777.474146 549.290582) rotate(-45)">cuda_B4_S1024_E4</text>
4475
  </g>
4476
  </g>
4477
  <g id="label--x" class="xlabel">
4478
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.468409" y="562.556245" transform="rotate(-0 451.468409 562.556245)">Workload</text>
4479
  </g>
4480
  </g>
4481
  <g id="matplotlib.axis_2">
4482
  <g id="ytick_1">
4483
  <g id="grid-y--2" class="grid grid-y">
4484
- <path d="M 57.17 448.894453 L 845.766818 448.894453 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4485
  </g>
4486
  <g id="line2d_9">
4487
  <defs>
4488
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4489
  </defs>
4490
  <g>
4491
- <use ns4:href="#m0fca2865ba" x="57.17" y="448.894453" style="stroke: #000000; stroke-width: 0.8" />
4492
  </g>
4493
  </g>
4494
  <g id="text_9">
4495
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="452.693672" transform="rotate(-0 50.17 452.693672)">0</text>
4496
  </g>
4497
  </g>
4498
  <g id="ytick_2">
4499
  <g id="grid-y--3" class="grid grid-y">
4500
- <path d="M 57.17 387.738866 L 845.766818 387.738866 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4501
  </g>
4502
  <g id="line2d_10">
4503
  <g>
4504
- <use ns4:href="#m0fca2865ba" x="57.17" y="387.738866" style="stroke: #000000; stroke-width: 0.8" />
4505
  </g>
4506
  </g>
4507
  <g id="text_10">
4508
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="391.538085" transform="rotate(-0 50.17 391.538085)">250</text>
4509
  </g>
4510
  </g>
4511
  <g id="ytick_3">
4512
  <g id="grid-y--4" class="grid grid-y">
4513
- <path d="M 57.17 326.583279 L 845.766818 326.583279 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4514
  </g>
4515
  <g id="line2d_11">
4516
  <g>
4517
- <use ns4:href="#m0fca2865ba" x="57.17" y="326.583279" style="stroke: #000000; stroke-width: 0.8" />
4518
  </g>
4519
  </g>
4520
  <g id="text_11">
4521
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="330.382498" transform="rotate(-0 50.17 330.382498)">500</text>
4522
  </g>
4523
  </g>
4524
  <g id="ytick_4">
4525
  <g id="grid-y--5" class="grid grid-y">
4526
- <path d="M 57.17 265.427692 L 845.766818 265.427692 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4527
  </g>
4528
  <g id="line2d_12">
4529
  <g>
4530
- <use ns4:href="#m0fca2865ba" x="57.17" y="265.427692" style="stroke: #000000; stroke-width: 0.8" />
4531
  </g>
4532
  </g>
4533
  <g id="text_12">
4534
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="269.226911" transform="rotate(-0 50.17 269.226911)">750</text>
4535
  </g>
4536
  </g>
4537
  <g id="ytick_5">
4538
  <g id="grid-y--6" class="grid grid-y">
4539
- <path d="M 57.17 204.272105 L 845.766818 204.272105 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4540
  </g>
4541
  <g id="line2d_13">
4542
  <g>
4543
- <use ns4:href="#m0fca2865ba" x="57.17" y="204.272105" style="stroke: #000000; stroke-width: 0.8" />
4544
  </g>
4545
  </g>
4546
  <g id="text_13">
4547
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="208.071324" transform="rotate(-0 50.17 208.071324)">1000</text>
4548
  </g>
4549
  </g>
4550
  <g id="ytick_6">
4551
  <g id="grid-y--7" class="grid grid-y">
4552
- <path d="M 57.17 143.116518 L 845.766818 143.116518 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4553
  </g>
4554
  <g id="line2d_14">
4555
  <g>
4556
- <use ns4:href="#m0fca2865ba" x="57.17" y="143.116518" style="stroke: #000000; stroke-width: 0.8" />
4557
  </g>
4558
  </g>
4559
  <g id="text_14">
4560
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="146.915736" transform="rotate(-0 50.17 146.915736)">1250</text>
4561
  </g>
4562
  </g>
4563
  <g id="ytick_7">
4564
  <g id="grid-y--8" class="grid grid-y">
4565
- <path d="M 57.17 81.960931 L 845.766818 81.960931 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4566
  </g>
4567
  <g id="line2d_15">
4568
  <g>
4569
- <use ns4:href="#m0fca2865ba" x="57.17" y="81.960931" style="stroke: #000000; stroke-width: 0.8" />
4570
  </g>
4571
  </g>
4572
  <g id="text_15">
4573
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.17" y="85.760149" transform="rotate(-0 50.17 85.760149)">1500</text>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4574
  </g>
4575
  </g>
4576
  <g id="label--y" class="ylabel">
4577
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.640312" y="247.598635" transform="rotate(-90 18.640312 247.598635)">Latency P50 (ms)</text>
4578
  </g>
4579
  </g>
4580
  <g id="series--binned-torch" class="series">
4581
- <path d="M 93.01531 410.178286 L 195.430481 397.766778 L 297.845652 356.453725 L 400.260823 348.866756 L 502.675995 260.623561 L 605.091166 249.051311 L 707.506337 77.923227 L 809.921508 46.94533 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4582
  <defs>
4583
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4584
  </defs>
4585
- <g clip-path="url(#p5307ca50d8)">
4586
- <use ns4:href="#md7efaf3aec" x="93.01531" y="410.178286" style="fill: #1f77b4; stroke: #1f77b4" />
4587
- <use ns4:href="#md7efaf3aec" x="195.430481" y="397.766778" style="fill: #1f77b4; stroke: #1f77b4" />
4588
- <use ns4:href="#md7efaf3aec" x="297.845652" y="356.453725" style="fill: #1f77b4; stroke: #1f77b4" />
4589
- <use ns4:href="#md7efaf3aec" x="400.260823" y="348.866756" style="fill: #1f77b4; stroke: #1f77b4" />
4590
- <use ns4:href="#md7efaf3aec" x="502.675995" y="260.623561" style="fill: #1f77b4; stroke: #1f77b4" />
4591
- <use ns4:href="#md7efaf3aec" x="605.091166" y="249.051311" style="fill: #1f77b4; stroke: #1f77b4" />
4592
- <use ns4:href="#md7efaf3aec" x="707.506337" y="77.923227" style="fill: #1f77b4; stroke: #1f77b4" />
4593
- <use ns4:href="#md7efaf3aec" x="809.921508" y="46.94533" style="fill: #1f77b4; stroke: #1f77b4" />
4594
  </g>
4595
  </g>
4596
  <g id="series--gpt-oss-experts" class="series">
4597
- <path d="M 93.01531 448.251939 L 195.430481 447.932519 L 297.845652 447.952742 L 400.260823 447.595994 L 502.675995 447.251251 L 605.091166 447.055342 L 707.506337 445.656252 L 809.921508 445.625657 " clip-path="url(#p5307ca50d8)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4598
  <defs>
4599
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4600
  </defs>
4601
- <g clip-path="url(#p5307ca50d8)">
4602
- <use ns4:href="#m9b8c54d372" x="93.01531" y="448.251939" style="fill: #ff7f0e; stroke: #ff7f0e" />
4603
- <use ns4:href="#m9b8c54d372" x="195.430481" y="447.932519" style="fill: #ff7f0e; stroke: #ff7f0e" />
4604
- <use ns4:href="#m9b8c54d372" x="297.845652" y="447.952742" style="fill: #ff7f0e; stroke: #ff7f0e" />
4605
- <use ns4:href="#m9b8c54d372" x="400.260823" y="447.595994" style="fill: #ff7f0e; stroke: #ff7f0e" />
4606
- <use ns4:href="#m9b8c54d372" x="502.675995" y="447.251251" style="fill: #ff7f0e; stroke: #ff7f0e" />
4607
- <use ns4:href="#m9b8c54d372" x="605.091166" y="447.055342" style="fill: #ff7f0e; stroke: #ff7f0e" />
4608
- <use ns4:href="#m9b8c54d372" x="707.506337" y="445.656252" style="fill: #ff7f0e; stroke: #ff7f0e" />
4609
- <use ns4:href="#m9b8c54d372" x="809.921508" y="445.625657" style="fill: #ff7f0e; stroke: #ff7f0e" />
4610
  </g>
4611
  </g>
4612
  <g id="patch_3">
4613
- <path d="M 57.17 468.317269 L 57.17 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4614
  </g>
4615
  <g id="patch_4">
4616
  <path d="M 845.766818 468.317269 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4617
  </g>
4618
  <g id="patch_5">
4619
- <path d="M 57.17 468.317269 L 845.766818 468.317269 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4620
  </g>
4621
  <g id="patch_6">
4622
- <path d="M 57.17 26.88 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4623
  </g>
4624
- <g id="text_16">
4625
- <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.468409" y="20.88" transform="rotate(-0 451.468409 20.88)">Attention Implementation Latency</text>
4626
  </g>
4627
  <g id="legend" class="legend">
4628
  <g id="patch_7">
4629
- <path d="M 64.17 64.7925 L 176.96375 64.7925 Q 178.96375 64.7925 178.96375 62.7925 L 178.96375 33.88 Q 178.96375 31.88 176.96375 31.88 L 64.17 31.88 Q 62.17 31.88 62.17 33.88 L 62.17 62.7925 Q 62.17 64.7925 64.17 64.7925 L 64.17 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4630
  </g>
4631
- <g id="line2d_16">
4632
- <path d="M 66.17 39.978438 L 76.17 39.978438 L 86.17 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4633
  <g>
4634
- <use ns4:href="#md7efaf3aec" x="76.17" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4635
  </g>
4636
  </g>
4637
  <g id="legend-label--binned-torch" class="legend">
4638
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.17" y="43.478438" transform="rotate(-0 94.17 43.478438)">binned_torch</text>
4639
  </g>
4640
- <g id="line2d_17">
4641
- <path d="M 66.17 54.934687 L 76.17 54.934687 L 86.17 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4642
  <g>
4643
- <use ns4:href="#m9b8c54d372" x="76.17" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4644
  </g>
4645
  </g>
4646
  <g id="legend-label--gpt-oss-experts" class="legend">
4647
- <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.17" y="58.434687" transform="rotate(-0 94.17 58.434687)">gpt_oss_experts</text>
4648
  </g>
4649
  </g>
4650
  </g>
4651
  </g>
4652
  <defs>
4653
- <clipPath id="p5307ca50d8">
4654
- <rect x="57.17" y="26.88" width="788.596818" height="441.437269" />
4655
  </clipPath>
4656
  </defs>
4657
  </svg>
 
3889
  <rdf:RDF>
3890
  <ns2:Work>
3891
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
3892
+ <dc:date>2025-12-19T23:02:40.893386</dc:date>
3893
  <dc:format>image/svg+xml</dc:format>
3894
  <dc:creator>
3895
  <ns2:Agent>
 
3908
  </g>
3909
  <g id="axes--1" class="axes">
3910
  <g id="patch_2">
3911
+ <path d="M 57.26 468.317269 L 845.766818 468.317269 L 845.766818 26.88 L 57.26 26.88 L 57.26 468.317269 z " style="fill: none" />
3912
  </g>
3913
  <g id="matplotlib.axis_1">
3914
  <g id="xtick_1">
3915
  <g id="grid-x--1" class="grid grid-x">
3916
+ <path d="M 93.101219 468.317269 L 93.101219 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3917
  </g>
3918
  <g id="line2d_1">
3919
  <defs>
3920
  <path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
3921
  </defs>
3922
  <g>
3923
+ <use ns4:href="#mafb3703e5b" x="93.101219" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3924
  </g>
3925
  </g>
3926
  <g id="text_1">
3927
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(62.90334 544.791615) rotate(-45)">cuda_B1_S512_E2</text>
3928
  </g>
3929
  </g>
3930
  <g id="xtick_2">
3931
  <g id="grid-x--2" class="grid grid-x">
3932
+ <path d="M 195.504702 468.317269 L 195.504702 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3933
  </g>
3934
  <g id="line2d_2">
3935
  <g>
3936
+ <use ns4:href="#mafb3703e5b" x="195.504702" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3937
  </g>
3938
  </g>
3939
  <g id="text_2">
3940
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(165.306823 544.791615) rotate(-45)">cuda_B1_S512_E4</text>
3941
  </g>
3942
  </g>
3943
  <g id="xtick_3">
3944
  <g id="grid-x--3" class="grid grid-x">
3945
+ <path d="M 297.908185 468.317269 L 297.908185 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3946
  </g>
3947
  <g id="line2d_3">
3948
  <g>
3949
+ <use ns4:href="#mafb3703e5b" x="297.908185" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3950
  </g>
3951
  </g>
3952
  <g id="text_3">
3953
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(265.460822 549.290582) rotate(-45)">cuda_B1_S1024_E2</text>
3954
  </g>
3955
  </g>
3956
  <g id="xtick_4">
3957
  <g id="grid-x--4" class="grid grid-x">
3958
+ <path d="M 400.311668 468.317269 L 400.311668 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3959
  </g>
3960
  <g id="line2d_4">
3961
  <g>
3962
+ <use ns4:href="#mafb3703e5b" x="400.311668" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3963
  </g>
3964
  </g>
3965
  <g id="text_4">
3966
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(367.864305 549.290582) rotate(-45)">cuda_B1_S1024_E4</text>
3967
  </g>
3968
  </g>
3969
  <g id="xtick_5">
3970
  <g id="grid-x--5" class="grid grid-x">
3971
+ <path d="M 502.71515 468.317269 L 502.71515 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3972
  </g>
3973
  <g id="line2d_5">
3974
  <g>
3975
+ <use ns4:href="#mafb3703e5b" x="502.71515" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3976
  </g>
3977
  </g>
3978
  <g id="text_5">
3979
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(472.517271 544.791615) rotate(-45)">cuda_B4_S512_E2</text>
3980
  </g>
3981
  </g>
3982
  <g id="xtick_6">
3983
  <g id="grid-x--6" class="grid grid-x">
3984
+ <path d="M 605.118633 468.317269 L 605.118633 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3985
  </g>
3986
  <g id="line2d_6">
3987
  <g>
3988
+ <use ns4:href="#mafb3703e5b" x="605.118633" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
3989
  </g>
3990
  </g>
3991
  <g id="text_6">
3992
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(574.920754 544.791615) rotate(-45)">cuda_B4_S512_E4</text>
3993
  </g>
3994
  </g>
3995
  <g id="xtick_7">
3996
  <g id="grid-x--7" class="grid grid-x">
3997
+ <path d="M 707.522116 468.317269 L 707.522116 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
3998
  </g>
3999
  <g id="line2d_7">
4000
  <g>
4001
+ <use ns4:href="#mafb3703e5b" x="707.522116" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4002
  </g>
4003
  </g>
4004
  <g id="text_7">
4005
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(675.074754 549.290582) rotate(-45)">cuda_B4_S1024_E2</text>
4006
  </g>
4007
  </g>
4008
  <g id="xtick_8">
4009
  <g id="grid-x--8" class="grid grid-x">
4010
+ <path d="M 809.925599 468.317269 L 809.925599 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4011
  </g>
4012
  <g id="line2d_8">
4013
  <g>
4014
+ <use ns4:href="#mafb3703e5b" x="809.925599" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4015
  </g>
4016
  </g>
4017
  <g id="text_8">
4018
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(777.478237 549.290582) rotate(-45)">cuda_B4_S1024_E4</text>
4019
  </g>
4020
  </g>
4021
  <g id="label--x" class="xlabel">
4022
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.513409" y="562.556245" transform="rotate(-0 451.513409 562.556245)">Workload</text>
4023
  </g>
4024
  </g>
4025
  <g id="matplotlib.axis_2">
4026
  <g id="ytick_1">
4027
  <g id="grid-y--2" class="grid grid-y">
4028
+ <path d="M 57.26 448.91253 L 845.766818 448.91253 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4029
  </g>
4030
  <g id="line2d_9">
4031
  <defs>
4032
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4033
  </defs>
4034
  <g>
4035
+ <use ns4:href="#m0fca2865ba" x="57.26" y="448.91253" style="stroke: #000000; stroke-width: 0.8" />
4036
  </g>
4037
  </g>
4038
  <g id="text_9">
4039
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="452.711749" transform="rotate(-0 50.26 452.711749)">0</text>
4040
  </g>
4041
  </g>
4042
  <g id="ytick_2">
4043
  <g id="grid-y--3" class="grid grid-y">
4044
+ <path d="M 57.26 399.227119 L 845.766818 399.227119 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4045
  </g>
4046
  <g id="line2d_10">
4047
  <g>
4048
+ <use ns4:href="#m0fca2865ba" x="57.26" y="399.227119" style="stroke: #000000; stroke-width: 0.8" />
4049
  </g>
4050
  </g>
4051
  <g id="text_10">
4052
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="403.026338" transform="rotate(-0 50.26 403.026338)">200</text>
4053
  </g>
4054
  </g>
4055
  <g id="ytick_3">
4056
  <g id="grid-y--4" class="grid grid-y">
4057
+ <path d="M 57.26 349.541708 L 845.766818 349.541708 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4058
  </g>
4059
  <g id="line2d_11">
4060
  <g>
4061
+ <use ns4:href="#m0fca2865ba" x="57.26" y="349.541708" style="stroke: #000000; stroke-width: 0.8" />
4062
  </g>
4063
  </g>
4064
  <g id="text_11">
4065
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="353.340926" transform="rotate(-0 50.26 353.340926)">400</text>
4066
  </g>
4067
  </g>
4068
  <g id="ytick_4">
4069
  <g id="grid-y--5" class="grid grid-y">
4070
+ <path d="M 57.26 299.856296 L 845.766818 299.856296 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4071
  </g>
4072
  <g id="line2d_12">
4073
  <g>
4074
+ <use ns4:href="#m0fca2865ba" x="57.26" y="299.856296" style="stroke: #000000; stroke-width: 0.8" />
4075
  </g>
4076
  </g>
4077
  <g id="text_12">
4078
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="303.655515" transform="rotate(-0 50.26 303.655515)">600</text>
4079
  </g>
4080
  </g>
4081
  <g id="ytick_5">
4082
  <g id="grid-y--6" class="grid grid-y">
4083
+ <path d="M 57.26 250.170885 L 845.766818 250.170885 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4084
  </g>
4085
  <g id="line2d_13">
4086
  <g>
4087
+ <use ns4:href="#m0fca2865ba" x="57.26" y="250.170885" style="stroke: #000000; stroke-width: 0.8" />
4088
  </g>
4089
  </g>
4090
  <g id="text_13">
4091
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="253.970104" transform="rotate(-0 50.26 253.970104)">800</text>
4092
  </g>
4093
  </g>
4094
  <g id="ytick_6">
4095
  <g id="grid-y--7" class="grid grid-y">
4096
+ <path d="M 57.26 200.485474 L 845.766818 200.485474 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4097
  </g>
4098
  <g id="line2d_14">
4099
  <g>
4100
+ <use ns4:href="#m0fca2865ba" x="57.26" y="200.485474" style="stroke: #000000; stroke-width: 0.8" />
4101
  </g>
4102
  </g>
4103
  <g id="text_14">
4104
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="204.284693" transform="rotate(-0 50.26 204.284693)">1000</text>
4105
  </g>
4106
  </g>
4107
  <g id="ytick_7">
4108
  <g id="grid-y--8" class="grid grid-y">
4109
+ <path d="M 57.26 150.800062 L 845.766818 150.800062 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4110
  </g>
4111
  <g id="line2d_15">
4112
  <g>
4113
+ <use ns4:href="#m0fca2865ba" x="57.26" y="150.800062" style="stroke: #000000; stroke-width: 0.8" />
4114
  </g>
4115
  </g>
4116
  <g id="text_15">
4117
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="154.599281" transform="rotate(-0 50.26 154.599281)">1200</text>
4118
+ </g>
4119
+ </g>
4120
+ <g id="ytick_8">
4121
+ <g id="grid-y--9" class="grid grid-y">
4122
+ <path d="M 57.26 101.114651 L 845.766818 101.114651 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4123
+ </g>
4124
+ <g id="line2d_16">
4125
+ <g>
4126
+ <use ns4:href="#m0fca2865ba" x="57.26" y="101.114651" style="stroke: #000000; stroke-width: 0.8" />
4127
+ </g>
4128
+ </g>
4129
+ <g id="text_16">
4130
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="104.91387" transform="rotate(-0 50.26 104.91387)">1400</text>
4131
+ </g>
4132
+ </g>
4133
+ <g id="ytick_9">
4134
+ <g id="grid-y--10" class="grid grid-y">
4135
+ <path d="M 57.26 51.42924 L 845.766818 51.42924 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4136
+ </g>
4137
+ <g id="line2d_17">
4138
+ <g>
4139
+ <use ns4:href="#m0fca2865ba" x="57.26" y="51.42924" style="stroke: #000000; stroke-width: 0.8" />
4140
+ </g>
4141
+ </g>
4142
+ <g id="text_17">
4143
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="55.228459" transform="rotate(-0 50.26 55.228459)">1600</text>
4144
  </g>
4145
  </g>
4146
  <g id="label--y" class="ylabel">
4147
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.730313" y="247.598635" transform="rotate(-90 18.730313 247.598635)">Latency P50 (ms)</text>
4148
  </g>
4149
  </g>
4150
  <g id="series--binned-torch" class="series">
4151
+ <path d="M 93.101219 410.567585 L 195.504702 400.332462 L 297.908185 357.497295 L 400.311668 350.461107 L 502.71515 266.697997 L 605.118633 253.249871 L 707.522116 73.76468 L 809.925599 46.94533 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4152
  <defs>
4153
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4154
  </defs>
4155
+ <g clip-path="url(#pef1bcf59f7)">
4156
+ <use ns4:href="#md7efaf3aec" x="93.101219" y="410.567585" style="fill: #1f77b4; stroke: #1f77b4" />
4157
+ <use ns4:href="#md7efaf3aec" x="195.504702" y="400.332462" style="fill: #1f77b4; stroke: #1f77b4" />
4158
+ <use ns4:href="#md7efaf3aec" x="297.908185" y="357.497295" style="fill: #1f77b4; stroke: #1f77b4" />
4159
+ <use ns4:href="#md7efaf3aec" x="400.311668" y="350.461107" style="fill: #1f77b4; stroke: #1f77b4" />
4160
+ <use ns4:href="#md7efaf3aec" x="502.71515" y="266.697997" style="fill: #1f77b4; stroke: #1f77b4" />
4161
+ <use ns4:href="#md7efaf3aec" x="605.118633" y="253.249871" style="fill: #1f77b4; stroke: #1f77b4" />
4162
+ <use ns4:href="#md7efaf3aec" x="707.522116" y="73.76468" style="fill: #1f77b4; stroke: #1f77b4" />
4163
+ <use ns4:href="#md7efaf3aec" x="809.925599" y="46.94533" style="fill: #1f77b4; stroke: #1f77b4" />
4164
  </g>
4165
  </g>
4166
  <g id="series--gpt-oss-experts" class="series">
4167
+ <path d="M 93.101219 448.251939 L 195.504702 447.930293 L 297.908185 447.951398 L 400.311668 447.585894 L 502.71515 447.222062 L 605.118633 447.041869 L 707.522116 445.587165 L 809.925599 445.581303 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4168
  <defs>
4169
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4170
  </defs>
4171
+ <g clip-path="url(#pef1bcf59f7)">
4172
+ <use ns4:href="#m9b8c54d372" x="93.101219" y="448.251939" style="fill: #ff7f0e; stroke: #ff7f0e" />
4173
+ <use ns4:href="#m9b8c54d372" x="195.504702" y="447.930293" style="fill: #ff7f0e; stroke: #ff7f0e" />
4174
+ <use ns4:href="#m9b8c54d372" x="297.908185" y="447.951398" style="fill: #ff7f0e; stroke: #ff7f0e" />
4175
+ <use ns4:href="#m9b8c54d372" x="400.311668" y="447.585894" style="fill: #ff7f0e; stroke: #ff7f0e" />
4176
+ <use ns4:href="#m9b8c54d372" x="502.71515" y="447.222062" style="fill: #ff7f0e; stroke: #ff7f0e" />
4177
+ <use ns4:href="#m9b8c54d372" x="605.118633" y="447.041869" style="fill: #ff7f0e; stroke: #ff7f0e" />
4178
+ <use ns4:href="#m9b8c54d372" x="707.522116" y="445.587165" style="fill: #ff7f0e; stroke: #ff7f0e" />
4179
+ <use ns4:href="#m9b8c54d372" x="809.925599" y="445.581303" style="fill: #ff7f0e; stroke: #ff7f0e" />
4180
  </g>
4181
  </g>
4182
  <g id="patch_3">
4183
+ <path d="M 57.26 468.317269 L 57.26 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4184
  </g>
4185
  <g id="patch_4">
4186
  <path d="M 845.766818 468.317269 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4187
  </g>
4188
  <g id="patch_5">
4189
+ <path d="M 57.26 468.317269 L 845.766818 468.317269 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4190
  </g>
4191
  <g id="patch_6">
4192
+ <path d="M 57.26 26.88 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4193
  </g>
4194
+ <g id="text_18">
4195
+ <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.513409" y="20.88" transform="rotate(-0 451.513409 20.88)">Attention Implementation Latency</text>
4196
  </g>
4197
  <g id="legend" class="legend">
4198
  <g id="patch_7">
4199
+ <path d="M 64.26 64.7925 L 177.05375 64.7925 Q 179.05375 64.7925 179.05375 62.7925 L 179.05375 33.88 Q 179.05375 31.88 177.05375 31.88 L 64.26 31.88 Q 62.26 31.88 62.26 33.88 L 62.26 62.7925 Q 62.26 64.7925 64.26 64.7925 L 64.26 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4200
  </g>
4201
+ <g id="line2d_18">
4202
+ <path d="M 66.26 39.978438 L 76.26 39.978438 L 86.26 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4203
  <g>
4204
+ <use ns4:href="#md7efaf3aec" x="76.26" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4205
  </g>
4206
  </g>
4207
  <g id="legend-label--binned-torch" class="legend">
4208
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.26" y="43.478438" transform="rotate(-0 94.26 43.478438)">binned_torch</text>
4209
  </g>
4210
+ <g id="line2d_19">
4211
+ <path d="M 66.26 54.934687 L 76.26 54.934687 L 86.26 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4212
  <g>
4213
+ <use ns4:href="#m9b8c54d372" x="76.26" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4214
  </g>
4215
  </g>
4216
  <g id="legend-label--gpt-oss-experts" class="legend">
4217
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.26" y="58.434687" transform="rotate(-0 94.26 58.434687)">gpt_oss_experts</text>
4218
  </g>
4219
  </g>
4220
  </g>
4221
  </g>
4222
  <defs>
4223
+ <clipPath id="pef1bcf59f7">
4224
+ <rect x="57.26" y="26.88" width="788.506818" height="441.437269" />
4225
  </clipPath>
4226
  </defs>
4227
  </svg>
 
4234
  <span onclick="toggleOutput('combine')" style="cursor: pointer;">▼ output</span>
4235
  <span id="uv-indicator-combine" onclick="toggleUvLogsFromHeader('combine')" style="cursor: pointer;">▶ uv-logs</span>
4236
  </span> |
4237
+ Cell: combine | 4.45s
4238
  | <button class="run-btn" onclick="runCell('combine')">▶ run</button>
4239
  <button class="copy-btn" onclick="copyCell('combine')">Copy</button>
4240
  <a href="cells/combine.py" target="_blank" class="raw-btn">Raw</a>
 
4323
  COMBINED BENCHMARK SUMMARY
4324
 
4325
  impl wl p50(ms) ok
4326
+ binned_torch cuda_B1_S1024_E2 367.98 True
4327
+ binned_torch cuda_B1_S1024_E4 396.30 True
4328
+ binned_torch cuda_B1_S512_E2 154.35 True
4329
+ binned_torch cuda_B1_S512_E4 195.55 True
4330
+ binned_torch cuda_B4_S1024_E2 1510.09 True
4331
+ binned_torch cuda_B4_S1024_E4 1618.05 True
4332
+ binned_torch cuda_B4_S512_E2 733.47 True
4333
+ binned_torch cuda_B4_S512_E4 787.61 True
4334
+ gpt_oss_experts cuda_B1_S1024_E2 3.87 True
4335
+ gpt_oss_experts cuda_B1_S1024_E4 5.34 True
4336
+ gpt_oss_experts cuda_B1_S512_E2 2.66 True
4337
+ gpt_oss_experts cuda_B1_S512_E4 3.95 True
4338
+ gpt_oss_experts cuda_B4_S1024_E2 13.39 True
4339
+ gpt_oss_experts cuda_B4_S1024_E4 13.41 True
4340
+ gpt_oss_experts cuda_B4_S512_E2 6.80 True
4341
+ gpt_oss_experts cuda_B4_S512_E4 7.53 True
4342
 
4343
  GENERATING COMBINED VISUALIZATION
4344
 
 
4358
  <div class="uv-install-logs" id="uv-logs-combine">
4359
  <div class="uv-logs-header" onclick="toggleUvLogs(this)">▶ UV Install Logs</div>
4360
  <div class="uv-logs-content" style="display: none;">
4361
+ Installed 37 packages in 266ms
4362
  </div>
4363
  </div>
4364
  <div class="cell-artifacts">
 
4371
  <rdf:RDF>
4372
  <ns2:Work>
4373
  <dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
4374
+ <dc:date>2025-12-19T23:02:40.893386</dc:date>
4375
  <dc:format>image/svg+xml</dc:format>
4376
  <dc:creator>
4377
  <ns2:Agent>
 
4390
  </g>
4391
  <g id="axes--1" class="axes">
4392
  <g id="patch_2">
4393
+ <path d="M 57.26 468.317269 L 845.766818 468.317269 L 845.766818 26.88 L 57.26 26.88 L 57.26 468.317269 z " style="fill: none" />
4394
  </g>
4395
  <g id="matplotlib.axis_1">
4396
  <g id="xtick_1">
4397
  <g id="grid-x--1" class="grid grid-x">
4398
+ <path d="M 93.101219 468.317269 L 93.101219 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4399
  </g>
4400
  <g id="line2d_1">
4401
  <defs>
4402
  <path id="mafb3703e5b" d="M 0 0 L 0 3.5 " style="stroke: #000000; stroke-width: 0.8" />
4403
  </defs>
4404
  <g>
4405
+ <use ns4:href="#mafb3703e5b" x="93.101219" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4406
  </g>
4407
  </g>
4408
  <g id="text_1">
4409
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(62.90334 544.791615) rotate(-45)">cuda_B1_S512_E2</text>
4410
  </g>
4411
  </g>
4412
  <g id="xtick_2">
4413
  <g id="grid-x--2" class="grid grid-x">
4414
+ <path d="M 195.504702 468.317269 L 195.504702 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4415
  </g>
4416
  <g id="line2d_2">
4417
  <g>
4418
+ <use ns4:href="#mafb3703e5b" x="195.504702" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4419
  </g>
4420
  </g>
4421
  <g id="text_2">
4422
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(165.306823 544.791615) rotate(-45)">cuda_B1_S512_E4</text>
4423
  </g>
4424
  </g>
4425
  <g id="xtick_3">
4426
  <g id="grid-x--3" class="grid grid-x">
4427
+ <path d="M 297.908185 468.317269 L 297.908185 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4428
  </g>
4429
  <g id="line2d_3">
4430
  <g>
4431
+ <use ns4:href="#mafb3703e5b" x="297.908185" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4432
  </g>
4433
  </g>
4434
  <g id="text_3">
4435
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(265.460822 549.290582) rotate(-45)">cuda_B1_S1024_E2</text>
4436
  </g>
4437
  </g>
4438
  <g id="xtick_4">
4439
  <g id="grid-x--4" class="grid grid-x">
4440
+ <path d="M 400.311668 468.317269 L 400.311668 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4441
  </g>
4442
  <g id="line2d_4">
4443
  <g>
4444
+ <use ns4:href="#mafb3703e5b" x="400.311668" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4445
  </g>
4446
  </g>
4447
  <g id="text_4">
4448
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(367.864305 549.290582) rotate(-45)">cuda_B1_S1024_E4</text>
4449
  </g>
4450
  </g>
4451
  <g id="xtick_5">
4452
  <g id="grid-x--5" class="grid grid-x">
4453
+ <path d="M 502.71515 468.317269 L 502.71515 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4454
  </g>
4455
  <g id="line2d_5">
4456
  <g>
4457
+ <use ns4:href="#mafb3703e5b" x="502.71515" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4458
  </g>
4459
  </g>
4460
  <g id="text_5">
4461
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(472.517271 544.791615) rotate(-45)">cuda_B4_S512_E2</text>
4462
  </g>
4463
  </g>
4464
  <g id="xtick_6">
4465
  <g id="grid-x--6" class="grid grid-x">
4466
+ <path d="M 605.118633 468.317269 L 605.118633 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4467
  </g>
4468
  <g id="line2d_6">
4469
  <g>
4470
+ <use ns4:href="#mafb3703e5b" x="605.118633" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4471
  </g>
4472
  </g>
4473
  <g id="text_6">
4474
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(574.920754 544.791615) rotate(-45)">cuda_B4_S512_E4</text>
4475
  </g>
4476
  </g>
4477
  <g id="xtick_7">
4478
  <g id="grid-x--7" class="grid grid-x">
4479
+ <path d="M 707.522116 468.317269 L 707.522116 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4480
  </g>
4481
  <g id="line2d_7">
4482
  <g>
4483
+ <use ns4:href="#mafb3703e5b" x="707.522116" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4484
  </g>
4485
  </g>
4486
  <g id="text_7">
4487
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(675.074754 549.290582) rotate(-45)">cuda_B4_S1024_E2</text>
4488
  </g>
4489
  </g>
4490
  <g id="xtick_8">
4491
  <g id="grid-x--8" class="grid grid-x">
4492
+ <path d="M 809.925599 468.317269 L 809.925599 26.88 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4493
  </g>
4494
  <g id="line2d_8">
4495
  <g>
4496
+ <use ns4:href="#mafb3703e5b" x="809.925599" y="468.317269" style="stroke: #000000; stroke-width: 0.8" />
4497
  </g>
4498
  </g>
4499
  <g id="text_8">
4500
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif" transform="translate(777.478237 549.290582) rotate(-45)">cuda_B4_S1024_E4</text>
4501
  </g>
4502
  </g>
4503
  <g id="label--x" class="xlabel">
4504
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.513409" y="562.556245" transform="rotate(-0 451.513409 562.556245)">Workload</text>
4505
  </g>
4506
  </g>
4507
  <g id="matplotlib.axis_2">
4508
  <g id="ytick_1">
4509
  <g id="grid-y--2" class="grid grid-y">
4510
+ <path d="M 57.26 448.91253 L 845.766818 448.91253 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4511
  </g>
4512
  <g id="line2d_9">
4513
  <defs>
4514
  <path id="m0fca2865ba" d="M 0 0 L -3.5 0 " style="stroke: #000000; stroke-width: 0.8" />
4515
  </defs>
4516
  <g>
4517
+ <use ns4:href="#m0fca2865ba" x="57.26" y="448.91253" style="stroke: #000000; stroke-width: 0.8" />
4518
  </g>
4519
  </g>
4520
  <g id="text_9">
4521
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="452.711749" transform="rotate(-0 50.26 452.711749)">0</text>
4522
  </g>
4523
  </g>
4524
  <g id="ytick_2">
4525
  <g id="grid-y--3" class="grid grid-y">
4526
+ <path d="M 57.26 399.227119 L 845.766818 399.227119 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4527
  </g>
4528
  <g id="line2d_10">
4529
  <g>
4530
+ <use ns4:href="#m0fca2865ba" x="57.26" y="399.227119" style="stroke: #000000; stroke-width: 0.8" />
4531
  </g>
4532
  </g>
4533
  <g id="text_10">
4534
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="403.026338" transform="rotate(-0 50.26 403.026338)">200</text>
4535
  </g>
4536
  </g>
4537
  <g id="ytick_3">
4538
  <g id="grid-y--4" class="grid grid-y">
4539
+ <path d="M 57.26 349.541708 L 845.766818 349.541708 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4540
  </g>
4541
  <g id="line2d_11">
4542
  <g>
4543
+ <use ns4:href="#m0fca2865ba" x="57.26" y="349.541708" style="stroke: #000000; stroke-width: 0.8" />
4544
  </g>
4545
  </g>
4546
  <g id="text_11">
4547
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="353.340926" transform="rotate(-0 50.26 353.340926)">400</text>
4548
  </g>
4549
  </g>
4550
  <g id="ytick_4">
4551
  <g id="grid-y--5" class="grid grid-y">
4552
+ <path d="M 57.26 299.856296 L 845.766818 299.856296 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4553
  </g>
4554
  <g id="line2d_12">
4555
  <g>
4556
+ <use ns4:href="#m0fca2865ba" x="57.26" y="299.856296" style="stroke: #000000; stroke-width: 0.8" />
4557
  </g>
4558
  </g>
4559
  <g id="text_12">
4560
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="303.655515" transform="rotate(-0 50.26 303.655515)">600</text>
4561
  </g>
4562
  </g>
4563
  <g id="ytick_5">
4564
  <g id="grid-y--6" class="grid grid-y">
4565
+ <path d="M 57.26 250.170885 L 845.766818 250.170885 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4566
  </g>
4567
  <g id="line2d_13">
4568
  <g>
4569
+ <use ns4:href="#m0fca2865ba" x="57.26" y="250.170885" style="stroke: #000000; stroke-width: 0.8" />
4570
  </g>
4571
  </g>
4572
  <g id="text_13">
4573
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="253.970104" transform="rotate(-0 50.26 253.970104)">800</text>
4574
  </g>
4575
  </g>
4576
  <g id="ytick_6">
4577
  <g id="grid-y--7" class="grid grid-y">
4578
+ <path d="M 57.26 200.485474 L 845.766818 200.485474 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4579
  </g>
4580
  <g id="line2d_14">
4581
  <g>
4582
+ <use ns4:href="#m0fca2865ba" x="57.26" y="200.485474" style="stroke: #000000; stroke-width: 0.8" />
4583
  </g>
4584
  </g>
4585
  <g id="text_14">
4586
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="204.284693" transform="rotate(-0 50.26 204.284693)">1000</text>
4587
  </g>
4588
  </g>
4589
  <g id="ytick_7">
4590
  <g id="grid-y--8" class="grid grid-y">
4591
+ <path d="M 57.26 150.800062 L 845.766818 150.800062 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4592
  </g>
4593
  <g id="line2d_15">
4594
  <g>
4595
+ <use ns4:href="#m0fca2865ba" x="57.26" y="150.800062" style="stroke: #000000; stroke-width: 0.8" />
4596
  </g>
4597
  </g>
4598
  <g id="text_15">
4599
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="154.599281" transform="rotate(-0 50.26 154.599281)">1200</text>
4600
+ </g>
4601
+ </g>
4602
+ <g id="ytick_8">
4603
+ <g id="grid-y--9" class="grid grid-y">
4604
+ <path d="M 57.26 101.114651 L 845.766818 101.114651 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4605
+ </g>
4606
+ <g id="line2d_16">
4607
+ <g>
4608
+ <use ns4:href="#m0fca2865ba" x="57.26" y="101.114651" style="stroke: #000000; stroke-width: 0.8" />
4609
+ </g>
4610
+ </g>
4611
+ <g id="text_16">
4612
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="104.91387" transform="rotate(-0 50.26 104.91387)">1400</text>
4613
+ </g>
4614
+ </g>
4615
+ <g id="ytick_9">
4616
+ <g id="grid-y--10" class="grid grid-y">
4617
+ <path d="M 57.26 51.42924 L 845.766818 51.42924 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #b0b0b0; stroke-opacity: 0.3; stroke-width: 0.8; stroke-linecap: square" />
4618
+ </g>
4619
+ <g id="line2d_17">
4620
+ <g>
4621
+ <use ns4:href="#m0fca2865ba" x="57.26" y="51.42924" style="stroke: #000000; stroke-width: 0.8" />
4622
+ </g>
4623
+ </g>
4624
+ <g id="text_17">
4625
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: end" x="50.26" y="55.228459" transform="rotate(-0 50.26 55.228459)">1600</text>
4626
  </g>
4627
  </g>
4628
  <g id="label--y" class="ylabel">
4629
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="18.730313" y="247.598635" transform="rotate(-90 18.730313 247.598635)">Latency P50 (ms)</text>
4630
  </g>
4631
  </g>
4632
  <g id="series--binned-torch" class="series">
4633
+ <path d="M 93.101219 410.567585 L 195.504702 400.332462 L 297.908185 357.497295 L 400.311668 350.461107 L 502.71515 266.697997 L 605.118633 253.249871 L 707.522116 73.76468 L 809.925599 46.94533 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4634
  <defs>
4635
  <path id="md7efaf3aec" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #1f77b4" />
4636
  </defs>
4637
+ <g clip-path="url(#pef1bcf59f7)">
4638
+ <use ns4:href="#md7efaf3aec" x="93.101219" y="410.567585" style="fill: #1f77b4; stroke: #1f77b4" />
4639
+ <use ns4:href="#md7efaf3aec" x="195.504702" y="400.332462" style="fill: #1f77b4; stroke: #1f77b4" />
4640
+ <use ns4:href="#md7efaf3aec" x="297.908185" y="357.497295" style="fill: #1f77b4; stroke: #1f77b4" />
4641
+ <use ns4:href="#md7efaf3aec" x="400.311668" y="350.461107" style="fill: #1f77b4; stroke: #1f77b4" />
4642
+ <use ns4:href="#md7efaf3aec" x="502.71515" y="266.697997" style="fill: #1f77b4; stroke: #1f77b4" />
4643
+ <use ns4:href="#md7efaf3aec" x="605.118633" y="253.249871" style="fill: #1f77b4; stroke: #1f77b4" />
4644
+ <use ns4:href="#md7efaf3aec" x="707.522116" y="73.76468" style="fill: #1f77b4; stroke: #1f77b4" />
4645
+ <use ns4:href="#md7efaf3aec" x="809.925599" y="46.94533" style="fill: #1f77b4; stroke: #1f77b4" />
4646
  </g>
4647
  </g>
4648
  <g id="series--gpt-oss-experts" class="series">
4649
+ <path d="M 93.101219 448.251939 L 195.504702 447.930293 L 297.908185 447.951398 L 400.311668 447.585894 L 502.71515 447.222062 L 605.118633 447.041869 L 707.522116 445.587165 L 809.925599 445.581303 " clip-path="url(#pef1bcf59f7)" style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4650
  <defs>
4651
  <path id="m9b8c54d372" d="M 0 3 C 0.795609 3 1.55874 2.683901 2.12132 2.12132 C 2.683901 1.55874 3 0.795609 3 0 C 3 -0.795609 2.683901 -1.55874 2.12132 -2.12132 C 1.55874 -2.683901 0.795609 -3 0 -3 C -0.795609 -3 -1.55874 -2.683901 -2.12132 -2.12132 C -2.683901 -1.55874 -3 -0.795609 -3 0 C -3 0.795609 -2.683901 1.55874 -2.12132 2.12132 C -1.55874 2.683901 -0.795609 3 0 3 z " style="stroke: #ff7f0e" />
4652
  </defs>
4653
+ <g clip-path="url(#pef1bcf59f7)">
4654
+ <use ns4:href="#m9b8c54d372" x="93.101219" y="448.251939" style="fill: #ff7f0e; stroke: #ff7f0e" />
4655
+ <use ns4:href="#m9b8c54d372" x="195.504702" y="447.930293" style="fill: #ff7f0e; stroke: #ff7f0e" />
4656
+ <use ns4:href="#m9b8c54d372" x="297.908185" y="447.951398" style="fill: #ff7f0e; stroke: #ff7f0e" />
4657
+ <use ns4:href="#m9b8c54d372" x="400.311668" y="447.585894" style="fill: #ff7f0e; stroke: #ff7f0e" />
4658
+ <use ns4:href="#m9b8c54d372" x="502.71515" y="447.222062" style="fill: #ff7f0e; stroke: #ff7f0e" />
4659
+ <use ns4:href="#m9b8c54d372" x="605.118633" y="447.041869" style="fill: #ff7f0e; stroke: #ff7f0e" />
4660
+ <use ns4:href="#m9b8c54d372" x="707.522116" y="445.587165" style="fill: #ff7f0e; stroke: #ff7f0e" />
4661
+ <use ns4:href="#m9b8c54d372" x="809.925599" y="445.581303" style="fill: #ff7f0e; stroke: #ff7f0e" />
4662
  </g>
4663
  </g>
4664
  <g id="patch_3">
4665
+ <path d="M 57.26 468.317269 L 57.26 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4666
  </g>
4667
  <g id="patch_4">
4668
  <path d="M 845.766818 468.317269 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4669
  </g>
4670
  <g id="patch_5">
4671
+ <path d="M 57.26 468.317269 L 845.766818 468.317269 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4672
  </g>
4673
  <g id="patch_6">
4674
+ <path d="M 57.26 26.88 L 845.766818 26.88 " style="fill: none; stroke: #000000; stroke-width: 0.8; stroke-linejoin: miter; stroke-linecap: square" />
4675
  </g>
4676
+ <g id="text_18">
4677
+ <text style="font-size: 12px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: middle" x="451.513409" y="20.88" transform="rotate(-0 451.513409 20.88)">Attention Implementation Latency</text>
4678
  </g>
4679
  <g id="legend" class="legend">
4680
  <g id="patch_7">
4681
+ <path d="M 64.26 64.7925 L 177.05375 64.7925 Q 179.05375 64.7925 179.05375 62.7925 L 179.05375 33.88 Q 179.05375 31.88 177.05375 31.88 L 64.26 31.88 Q 62.26 31.88 62.26 33.88 L 62.26 62.7925 Q 62.26 64.7925 64.26 64.7925 L 64.26 64.7925 z " style="fill: none; opacity: 0.8; stroke: #cccccc; stroke-linejoin: miter" />
4682
  </g>
4683
+ <g id="line2d_18">
4684
+ <path d="M 66.26 39.978438 L 76.26 39.978438 L 86.26 39.978438 " style="fill: none; stroke: #1f77b4; stroke-width: 1.5; stroke-linecap: square" />
4685
  <g>
4686
+ <use ns4:href="#md7efaf3aec" x="76.26" y="39.978438" style="fill: #1f77b4; stroke: #1f77b4" />
4687
  </g>
4688
  </g>
4689
  <g id="legend-label--binned-torch" class="legend">
4690
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.26" y="43.478438" transform="rotate(-0 94.26 43.478438)">binned_torch</text>
4691
  </g>
4692
+ <g id="line2d_19">
4693
+ <path d="M 66.26 54.934687 L 76.26 54.934687 L 86.26 54.934687 " style="fill: none; stroke: #ff7f0e; stroke-width: 1.5; stroke-linecap: square" />
4694
  <g>
4695
+ <use ns4:href="#m9b8c54d372" x="76.26" y="54.934687" style="fill: #ff7f0e; stroke: #ff7f0e" />
4696
  </g>
4697
  </g>
4698
  <g id="legend-label--gpt-oss-experts" class="legend">
4699
+ <text style="font-size: 10px; font-family: 'DejaVu Sans', 'Bitstream Vera Sans', 'Computer Modern Sans Serif', 'Lucida Grande', 'Verdana', 'Geneva', 'Lucid', 'Arial', 'Helvetica', 'Avant Garde', sans-serif; text-anchor: start" x="94.26" y="58.434687" transform="rotate(-0 94.26 58.434687)">gpt_oss_experts</text>
4700
  </g>
4701
  </g>
4702
  </g>
4703
  </g>
4704
  <defs>
4705
+ <clipPath id="pef1bcf59f7">
4706
+ <rect x="57.26" y="26.88" width="788.506818" height="441.437269" />
4707
  </clipPath>
4708
  </defs>
4709
  </svg>
rotary/impls/artifacts/benchmark/rotary.jsonl CHANGED
@@ -1,24 +1,24 @@
1
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.07445200003530772, "p50": 0.07589100005134242, "p90": 0.07600200001434132, "mean": 0.0754678000248532, "iqr": 0.0014600000213249587, "raw_times": [0.0764520000302582, 0.07600200001434132, 0.07589100005134242, 0.07454199999301636, 0.07445200003530772], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08018199991965957, "peak_bytes": 3178496, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590452924915553e-08, "mae_k": 1.5487040982975486e-08, "mse_q": 2.5241010080938753e-15, "mse_k": 2.364223539299626e-15, "ref": "rotary_torch"}, "err": null}
2
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08898300006876525, "p50": 0.09174199999506527, "p90": 0.09300300007453188, "mean": 0.09168480000880663, "iqr": 0.0013200001376389991, "raw_times": [0.09168299993689288, 0.09300300007453188, 0.09174199999506527, 0.09301299996877788, 0.08898300006876525], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09485199984737847, "peak_bytes": 6356992, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5508486939097565e-08, "mae_k": 1.567566698668088e-08, "mse_q": 2.3630110116356316e-15, "mse_k": 2.416562128626943e-15, "ref": "rotary_torch"}, "err": null}
3
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08705300001565774, "p50": 0.09316300020145718, "p90": 0.10223200001746591, "mean": 0.09793460003493237, "iqr": 0.013889999991079094, "raw_times": [0.10223200001746591, 0.09316300020145718, 0.11888299991369422, 0.08834200002638681, 0.08705300001565774], "has_warnings": true, "reps": 5, "warmup": 2}, "compile_ms": 0.09152200004791666, "peak_bytes": 12615680, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5856898016863852e-08, "mae_k": 1.572981211950264e-08, "mse_q": 2.4771055025978386e-15, "mse_k": 2.4544071371937915e-15, "ref": "rotary_torch"}, "err": null}
4
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0873520000368444, "p50": 0.08926300006351084, "p90": 0.08946200000536919, "mean": 0.08885220004231087, "iqr": 0.0013999999737279722, "raw_times": [0.0873520000368444, 0.09012200007418869, 0.08926300006351084, 0.08806200003164122, 0.08946200000536919], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09113200007959676, "peak_bytes": 25231360, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5617658277733426e-08, "mae_k": 1.5788685914230882e-08, "mse_q": 2.4549424620164562e-15, "mse_k": 2.492823469483563e-15, "ref": "rotary_torch"}, "err": null}
5
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08857299985720601, "p50": 0.09020299989970226, "p90": 0.09035299990500789, "mean": 0.0900987999557401, "iqr": 0.00085999977272877, "raw_times": [0.08949300013227912, 0.09020299989970226, 0.09035299990500789, 0.08857299985720601, 0.09187199998450524], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09262200001103338, "peak_bytes": 12779520, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5962712041073246e-08, "mae_k": 1.5743363945830424e-08, "mse_q": 2.534145124782417e-15, "mse_k": 2.451281585618423e-15, "ref": "rotary_torch"}, "err": null}
6
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08818200012683519, "p50": 0.08903200000531797, "p90": 0.08924200005822058, "mean": 0.08891200000107347, "iqr": 0.0008400002116104588, "raw_times": [0.08903200000531797, 0.08840199984661012, 0.08818200012683519, 0.08924200005822058, 0.08970199996838346], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09153199994216266, "peak_bytes": 25427968, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.578730035589615e-08, "mae_k": 1.5859711766097462e-08, "mse_q": 2.440287521479536e-15, "mse_k": 2.477901290051784e-15, "ref": "rotary_torch"}, "err": null}
7
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08838300004754274, "p50": 0.08999199985737505, "p90": 0.0905619999684859, "mean": 0.09254639999198844, "iqr": 0.0011899999208253575, "raw_times": [0.0905619999684859, 0.10442300003887794, 0.08937200004766055, 0.08999199985737505, 0.08838300004754274], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10543200005486142, "peak_bytes": 50462720, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5775295736375483e-08, "mae_k": 1.5847881229547056e-08, "mse_q": 2.471039476146077e-15, "mse_k": 2.472378635235686e-15, "ref": "rotary_torch"}, "err": null}
8
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08926200007408625, "p50": 0.08963200002654048, "p90": 0.09017300021696428, "mean": 0.08983220009213255, "iqr": 0.0005410001904238015, "raw_times": [0.08963200002654048, 0.09017300021696428, 0.08926200007408625, 0.08963200002654048, 0.09046200011653127], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09046300010595587, "peak_bytes": 100925440, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5959869870130206e-08, "mae_k": 1.588083975434529e-08, "mse_q": 2.510663677418633e-15, "mse_k": 2.502786271009168e-15, "ref": "rotary_torch"}, "err": null}
9
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08713199986232212, "p50": 0.08950200003710052, "p90": 0.08994299992082233, "mean": 0.08932219993766921, "iqr": 0.0012210000477352878, "raw_times": [0.08872199987308704, 0.08950200003710052, 0.08994299992082233, 0.09131199999501405, 0.08713199986232212], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09138199993685703, "peak_bytes": 51118080, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5894533689220225e-08, "mae_k": 1.5873395042831362e-08, "mse_q": 2.5093181655819197e-15, "mse_k": 2.488611809911578e-15, "ref": "rotary_torch"}, "err": null}
10
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08889199989425833, "p50": 0.09063199991032889, "p90": 0.09075200000552286, "mean": 0.09513419990980765, "iqr": 0.0011500001164677087, "raw_times": [0.08889199989425833, 0.08960199988905515, 0.09075200000552286, 0.09063199991032889, 0.11579299984987301], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09194200015372189, "peak_bytes": 101711872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5936768349433805e-08, "mae_k": 1.5960043953100467e-08, "mse_q": 2.51039008577667e-15, "mse_k": 2.5111253103748867e-15, "ref": "rotary_torch"}, "err": null}
11
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08985299996311369, "p50": 0.09080199993150018, "p90": 0.09128199985752872, "mean": 0.09099019994209812, "iqr": 0.0010899998414970469, "raw_times": [0.08985299996311369, 0.09019200001603167, 0.09128199985752872, 0.09282199994231632, 0.09080199993150018], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09297199994762195, "peak_bytes": 201850880, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 1.9073486328125e-06, "absmax_k": 9.5367431640625e-07, "mae_q": 1.586510300910504e-08, "mae_k": 1.5813935050346117e-08, "mse_q": 2.499836478770355e-15, "mse_k": 2.4755639026338358e-15, "ref": "rotary_torch"}, "err": null}
12
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2603559998988203, "p50": 0.2619460001369589, "p90": 0.2620170000682265, "mean": 0.26208240001324157, "iqr": 0.00017100001059588976, "raw_times": [0.2619460001369589, 0.2603559998988203, 0.2618460000576306, 0.26424699990457157, 0.2620170000682265], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.26098600005752814, "peak_bytes": 403701760, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.581049247079136e-08, "mae_k": 1.5861061797295406e-08, "mse_q": 2.4735094242202705e-15, "mse_k": 2.486832828964107e-15, "ref": "rotary_torch"}, "err": null}
13
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0871929998993437, "p50": 0.08902200011107197, "p90": 0.08904199989956396, "mean": 0.08843839996188763, "iqr": 0.0015389998679893324, "raw_times": [0.08750300003157463, 0.08904199989956396, 0.08943199986788386, 0.08902200011107197, 0.0871929998993437], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09225299982063007, "peak_bytes": 137396224, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5824980437173508e-08, "mae_k": 1.5615324144846454e-08, "mse_q": 2.488090249374306e-15, "mse_k": 2.425079044911585e-15, "ref": "rotary_torch"}, "err": null}
14
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0884020000739838, "p50": 0.08922200004235492, "p90": 0.08970199996838346, "mean": 0.08935000005294569, "iqr": 0.0005199999577598646, "raw_times": [0.08922200004235492, 0.0884020000739838, 0.09024200016938266, 0.08970199996838346, 0.0891820000106236], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09462200000598386, "peak_bytes": 12648448, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5683587761827766e-08, "mae_k": 1.574532682013796e-08, "mse_q": 2.4310271220254415e-15, "mse_k": 2.4601385856313877e-15, "ref": "rotary_torch"}, "err": null}
15
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08808200004750688, "p50": 0.08919200013224327, "p90": 0.09035200014295697, "mean": 0.0894580000476708, "iqr": 0.0017100001059588976, "raw_times": [0.08808200004750688, 0.08919200013224327, 0.09035200014295697, 0.09102199987864878, 0.08864200003699807], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0948819999848638, "peak_bytes": 25198592, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5835009747888762e-08, "mae_k": 1.572560215379326e-08, "mse_q": 2.478222950813504e-15, "mse_k": 2.4541699679685603e-15, "ref": "rotary_torch"}, "err": null}
16
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08770199997343298, "p50": 0.08942199997363787, "p90": 0.08942299996306247, "mean": 0.08932019995882001, "iqr": 0.0003010000000358559, "raw_times": [0.08912199996302661, 0.09093199992094014, 0.08770199997343298, 0.08942299996306247, 0.08942199997363787], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09424199993190996, "peak_bytes": 50397184, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5876850056883995e-08, "mae_k": 1.5927410501603845e-08, "mse_q": 2.504224532953606e-15, "mse_k": 2.503892919554756e-15, "ref": "rotary_torch"}, "err": null}
17
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08902200011107197, "p50": 0.09058199998435157, "p90": 0.09125299993684166, "mean": 0.09215640002366854, "iqr": 0.0012609998520929366, "raw_times": [0.09993300000132876, 0.09058199998435157, 0.09125299993684166, 0.08902200011107197, 0.08999200008474872], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09323300014330016, "peak_bytes": 25362432, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5820052823301012e-08, "mae_k": 1.580205122309053e-08, "mse_q": 2.4876468276264184e-15, "mse_k": 2.4866062476507165e-15, "ref": "rotary_torch"}, "err": null}
18
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08776200002102996, "p50": 0.08892300002116826, "p90": 0.08966199993665214, "mean": 0.0888985999608849, "iqr": 0.001638999947317643, "raw_times": [0.09012299983623961, 0.08892300002116826, 0.08966199993665214, 0.08776200002102996, 0.0880229999893345], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09301299996877788, "peak_bytes": 50593792, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5823172105911e-08, "mae_k": 1.582038855474366e-08, "mse_q": 2.464257071579175e-15, "mse_k": 2.4775099608301526e-15, "ref": "rotary_torch"}, "err": null}
19
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0880819998201332, "p50": 0.08909200005291495, "p90": 0.08928200008995191, "mean": 0.08905000004233443, "iqr": 0.00023999996301427018, "raw_times": [0.0880819998201332, 0.08904200012693764, 0.08975200012173445, 0.08909200005291495, 0.08928200008995191], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09304200011683861, "peak_bytes": 100794368, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5888783622131086e-08, "mae_k": 1.5861886026868888e-08, "mse_q": 2.4766798685418433e-15, "mse_k": 2.475923891636419e-15, "ref": "rotary_torch"}, "err": null}
20
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08700200010025583, "p50": 0.08849200003169244, "p90": 0.0890119999894523, "mean": 0.08845600000313425, "iqr": 0.0007099999947968172, "raw_times": [0.0890119999894523, 0.08830199999465549, 0.08947199989961518, 0.08700200010025583, 0.08849200003169244], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09235300012733205, "peak_bytes": 201588736, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5826390864503992e-08, "mae_k": 1.5792682717119533e-08, "mse_q": 2.480465258783123e-15, "mse_k": 2.475580631534544e-15, "ref": "rotary_torch"}, "err": null}
21
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08904200012693764, "p50": 0.0900719999208377, "p90": 0.09035200014295697, "mean": 0.09022600006574066, "iqr": 0.0009600000794307562, "raw_times": [0.09035200014295697, 0.08904200012693764, 0.0922720000744448, 0.08939200006352621, 0.0900719999208377], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09144199998445401, "peak_bytes": 101449728, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.592899323554775e-08, "mae_k": 1.5925031959795888e-08, "mse_q": 2.50783882253954e-15, "mse_k": 2.5015648494992274e-15, "ref": "rotary_torch"}, "err": null}
22
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08612200008428772, "p50": 0.08916199999475793, "p90": 0.08966199993665214, "mean": 0.08842420002110885, "iqr": 0.002328999926248798, "raw_times": [0.08612200008428772, 0.0898420000794431, 0.08916199999475793, 0.08733300001040334, 0.08966199993665214], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09376299999530602, "peak_bytes": 202375168, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590209919299923e-08, "mae_k": 1.590130160877834e-08, "mse_q": 2.4971026799330918e-15, "mse_k": 2.506967649153289e-15, "ref": "rotary_torch"}, "err": null}
23
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2547460001096624, "p50": 0.25804599999901257, "p90": 0.2586460000202351, "mean": 0.25757600001270475, "iqr": 0.0013200001376389991, "raw_times": [0.2547460001096624, 0.2591160000520176, 0.25804599999901257, 0.2586460000202351, 0.2573259998825961], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.25434600001972285, "peak_bytes": 403177472, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5847520629108658e-08, "mae_k": 1.5862454461057496e-08, "mse_q": 2.4917348203881045e-15, "mse_k": 2.491306009958557e-15, "ref": "rotary_torch"}, "err": null}
24
- {"ts": "2025-12-19T19:55:20Z", "run": "c99aba47df8541d191cab69afa5e530c", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8454199999050616, "p50": 0.8495600000060222, "p90": 0.8538209999642277, "mean": 0.8503745999860257, "iqr": 0.0067099999796482734, "raw_times": [0.8559610000702378, 0.8538209999642277, 0.8471109999845794, 0.8495600000060222, 0.8454199999050616], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8642910001981363, "peak_bytes": 806354944, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.585225106737198e-08, "mae_k": 1.581303976649906e-08, "mse_q": 2.4866460581992374e-15, "mse_k": 2.4721545950211372e-15, "ref": "rotary_torch"}, "err": null}
 
1
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0716920001195831, "p50": 0.07250199996633455, "p90": 0.07283199988705746, "mean": 0.07240760000968294, "iqr": 0.0009109999155043624, "raw_times": [0.07250199996633455, 0.07283199988705746, 0.0716920001195831, 0.0719209999715531, 0.07309100010388647], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08089199991445639, "peak_bytes": 3178496, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590452924915553e-08, "mae_k": 1.5487040982975486e-08, "mse_q": 2.5241010080938753e-15, "mse_k": 2.364223539299626e-15, "ref": "rotary_torch"}, "err": null}
2
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H8_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08928200008995191, "p50": 0.08997100007945846, "p90": 0.0909519999368058, "mean": 0.09690580000096816, "iqr": 0.001269999984288006, "raw_times": [0.08928200008995191, 0.08997100007945846, 0.0896819999525178, 0.0909519999368058, 0.12464199994610681], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10248300009152445, "peak_bytes": 6356992, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5508486939097565e-08, "mae_k": 1.567566698668088e-08, "mse_q": 2.3630110116356316e-15, "mse_k": 2.416562128626943e-15, "ref": "rotary_torch"}, "err": null}
3
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D64_R32", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08800199998404423, "p50": 0.08953199994721217, "p90": 0.09005199990497204, "mean": 0.08934599995882309, "iqr": 0.0015899997833912494, "raw_times": [0.08800199998404423, 0.0906819998363062, 0.08953199994721217, 0.08846200012158079, 0.09005199990497204], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08988199988380075, "peak_bytes": 12615680, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5856898016863852e-08, "mae_k": 1.572981211950264e-08, "mse_q": 2.4771055025978386e-15, "mse_k": 2.4544071371937915e-15, "ref": "rotary_torch"}, "err": null}
4
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S128_H32_D128_R64", "batch": 1, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08559200000490819, "p50": 0.08698200008439017, "p90": 0.08820199991532718, "mean": 0.08761399999457353, "iqr": 0.0018399998680251883, "raw_times": [0.08820199991532718, 0.08698200008439017, 0.09093199992094014, 0.08559200000490819, 0.08636200004730199], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09196199994221388, "peak_bytes": 25231360, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5617658277733426e-08, "mae_k": 1.5788685914230882e-08, "mse_q": 2.4549424620164562e-15, "mse_k": 2.492823469483563e-15, "ref": "rotary_torch"}, "err": null}
5
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0869620000685245, "p50": 0.08914199997889227, "p90": 0.08982199983620376, "mean": 0.08910199994716095, "iqr": 0.0019899998733308166, "raw_times": [0.08783199996287294, 0.08914199997889227, 0.08982199983620376, 0.0869620000685245, 0.09175199988931126], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08990199989966641, "peak_bytes": 12779520, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5962712041073246e-08, "mae_k": 1.5743363945830424e-08, "mse_q": 2.534145124782417e-15, "mse_k": 2.451281585618423e-15, "ref": "rotary_torch"}, "err": null}
6
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H8_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08547199990971421, "p50": 0.08752200005801569, "p90": 0.08800199998404423, "mean": 0.08713399993212079, "iqr": 0.002420000100755715, "raw_times": [0.08558199988328852, 0.08909199982554128, 0.08752200005801569, 0.08547199990971421, 0.08800199998404423], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0911110000743065, "peak_bytes": 25427968, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.578730035589615e-08, "mae_k": 1.5859711766097462e-08, "mse_q": 2.440287521479536e-15, "mse_k": 2.477901290051784e-15, "ref": "rotary_torch"}, "err": null}
7
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D64_R32", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08664199981467391, "p50": 0.0881319999734842, "p90": 0.08973199987849512, "mean": 0.08822799991321517, "iqr": 0.0029299999368959107, "raw_times": [0.08664199981467391, 0.08983199995782343, 0.0881319999734842, 0.0868019999415992, 0.08973199987849512], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08937200004766055, "peak_bytes": 50462720, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5775295736375483e-08, "mae_k": 1.5847881229547056e-08, "mse_q": 2.471039476146077e-15, "mse_k": 2.472378635235686e-15, "ref": "rotary_torch"}, "err": null}
8
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S512_H32_D128_R64", "batch": 1, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08743200010030705, "p50": 0.08830199999465549, "p90": 0.08903200000531797, "mean": 0.08846400005495525, "iqr": 0.0009099999260797631, "raw_times": [0.08830199999465549, 0.08903200000531797, 0.08943200009525754, 0.0881220000792382, 0.08743200010030705], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09105200001613412, "peak_bytes": 100925440, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5959869870130206e-08, "mae_k": 1.588083975434529e-08, "mse_q": 2.510663677418633e-15, "mse_k": 2.502786271009168e-15, "ref": "rotary_torch"}, "err": null}
9
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08645200000501063, "p50": 0.08729199998924742, "p90": 0.08832200001052115, "mean": 0.08765380002841994, "iqr": 0.0011610000001383014, "raw_times": [0.08645200000501063, 0.08904200012693764, 0.08832200001052115, 0.08716100001038285, 0.08729199998924742], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08923200016397459, "peak_bytes": 51118080, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5894533689220225e-08, "mae_k": 1.5873395042831362e-08, "mse_q": 2.5093181655819197e-15, "mse_k": 2.488611809911578e-15, "ref": "rotary_torch"}, "err": null}
10
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H8_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08737200005271006, "p50": 0.08886199998414668, "p90": 0.08923200016397459, "mean": 0.08870620004017837, "iqr": 0.0006990001111262245, "raw_times": [0.08737200005271006, 0.08953199994721217, 0.08886199998414668, 0.08853300005284837, 0.08923200016397459], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.08981199994195777, "peak_bytes": 101711872, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5936768349433805e-08, "mae_k": 1.5960043953100467e-08, "mse_q": 2.51039008577667e-15, "mse_k": 2.5111253103748867e-15, "ref": "rotary_torch"}, "err": null}
11
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D64_R32", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08698100009496557, "p50": 0.0875120001637697, "p90": 0.08985099998426449, "mean": 0.08861960000103863, "iqr": 0.0027790001695393585, "raw_times": [0.0875120001637697, 0.08985099998426449, 0.08698100009496557, 0.09168199994746828, 0.08707199981472513], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09252200015907874, "peak_bytes": 201850880, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 1.9073486328125e-06, "absmax_k": 9.5367431640625e-07, "mae_q": 1.586510300910504e-08, "mae_k": 1.5813935050346117e-08, "mse_q": 2.499836478770355e-15, "mse_k": 2.4755639026338358e-15, "ref": "rotary_torch"}, "err": null}
12
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B1_S2048_H32_D128_R64", "batch": 1, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.2610050000839692, "p50": 0.261626000110482, "p90": 0.26182599981439125, "mean": 0.261735599997337, "iqr": 0.0002109998149535386, "raw_times": [0.2616149999994377, 0.2610050000839692, 0.261626000110482, 0.26182599981439125, 0.2626059999784047], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.26071599995702854, "peak_bytes": 403701760, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.581049247079136e-08, "mae_k": 1.5861061797295406e-08, "mse_q": 2.4735094242202705e-15, "mse_k": 2.486832828964107e-15, "ref": "rotary_torch"}, "err": null}
13
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08662100003675732, "p50": 0.08793199981482758, "p90": 0.08872200010046072, "mean": 0.08787560000200756, "iqr": 0.0014409999948838959, "raw_times": [0.08662100003675732, 0.08872200010046072, 0.08882199995241535, 0.08793199981482758, 0.08728100010557682], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0905719998627319, "peak_bytes": 137396224, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5824980437173508e-08, "mae_k": 1.5615324144846454e-08, "mse_q": 2.488090249374306e-15, "mse_k": 2.425079044911585e-15, "ref": "rotary_torch"}, "err": null}
14
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H8_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08488200001011137, "p50": 0.08702200011612149, "p90": 0.08860200000526675, "mean": 0.08870799997566792, "iqr": 0.002460000132487039, "raw_times": [0.08488200001011137, 0.08702200011612149, 0.08860200000526675, 0.0861419998727797, 0.09689199987406028], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09050199992088892, "peak_bytes": 12648448, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5683587761827766e-08, "mae_k": 1.574532682013796e-08, "mse_q": 2.4310271220254415e-15, "mse_k": 2.4601385856313877e-15, "ref": "rotary_torch"}, "err": null}
15
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D64_R32", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08632200001557067, "p50": 0.08908199993129529, "p90": 0.08911199984140694, "mean": 0.08852599994497723, "iqr": 0.0001399998836859595, "raw_times": [0.08632200001557067, 0.08914199997889227, 0.08911199984140694, 0.08908199993129529, 0.08897199995772098], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09170199996333395, "peak_bytes": 25198592, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5835009747888762e-08, "mae_k": 1.572560215379326e-08, "mse_q": 2.478222950813504e-15, "mse_k": 2.4541699679685603e-15, "ref": "rotary_torch"}, "err": null}
16
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S128_H32_D128_R64", "batch": 2, "seqlen": 128, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08825200006867817, "p50": 0.08947200012698886, "p90": 0.08949199991548085, "mean": 0.08924200005822058, "iqr": 0.00026999987312592566, "raw_times": [0.08825200006867817, 0.08977200013760012, 0.08949199991548085, 0.08947200012698886, 0.08922200004235492], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09157099998446938, "peak_bytes": 50397184, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5876850056883995e-08, "mae_k": 1.5927410501603845e-08, "mse_q": 2.504224532953606e-15, "mse_k": 2.503892919554756e-15, "ref": "rotary_torch"}, "err": null}
17
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08679200004735321, "p50": 0.08709200005796447, "p90": 0.08838100006869354, "mean": 0.08772180003688845, "iqr": 0.0012890000107290689, "raw_times": [0.08709200005796447, 0.08679200004735321, 0.08925199995246658, 0.08709200005796447, 0.08838100006869354], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09222199992109381, "peak_bytes": 25362432, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5820052823301012e-08, "mae_k": 1.580205122309053e-08, "mse_q": 2.4876468276264184e-15, "mse_k": 2.4866062476507165e-15, "ref": "rotary_torch"}, "err": null}
18
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H8_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08846200012158079, "p50": 0.08976200001598045, "p90": 0.09003199988910637, "mean": 0.0897020000138582, "iqr": 0.00085999977272877, "raw_times": [0.08846200012158079, 0.09003199988910637, 0.08976200001598045, 0.0891720001163776, 0.09108199992624577], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0899420001587714, "peak_bytes": 50593792, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5823172105911e-08, "mae_k": 1.582038855474366e-08, "mse_q": 2.464257071579175e-15, "mse_k": 2.4775099608301526e-15, "ref": "rotary_torch"}, "err": null}
19
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D64_R32", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08725199995751609, "p50": 0.08807200015326089, "p90": 0.08955199996307783, "mean": 0.08866800003488606, "iqr": 0.0017499999103165464, "raw_times": [0.08780200005276129, 0.08807200015326089, 0.08955199996307783, 0.08725199995751609, 0.09066200004781422], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.10307200000170269, "peak_bytes": 100794368, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5888783622131086e-08, "mae_k": 1.5861886026868888e-08, "mse_q": 2.4766798685418433e-15, "mse_k": 2.475923891636419e-15, "ref": "rotary_torch"}, "err": null}
20
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S512_H32_D128_R64", "batch": 2, "seqlen": 512, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.0861920000261307, "p50": 0.08893199992598966, "p90": 0.08899199997358664, "mean": 0.08807199997136195, "iqr": 0.0017600000319362152, "raw_times": [0.0861920000261307, 0.08723199994165043, 0.08893199992598966, 0.0890119999894523, 0.08899199997358664], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09088199999496283, "peak_bytes": 201588736, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5826390864503992e-08, "mae_k": 1.5792682717119533e-08, "mse_q": 2.480465258783123e-15, "mse_k": 2.475580631534544e-15, "ref": "rotary_torch"}, "err": null}
21
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08800199998404423, "p50": 0.08883200007403502, "p90": 0.08927299995775684, "mean": 0.08933019998949021, "iqr": 0.0010710000424296595, "raw_times": [0.08800199998404423, 0.08883200007403502, 0.09234200001628778, 0.08820199991532718, 0.08927299995775684], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.09017200000016601, "peak_bytes": 101449728, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.592899323554775e-08, "mae_k": 1.5925031959795888e-08, "mse_q": 2.50783882253954e-15, "mse_k": 2.5015648494992274e-15, "ref": "rotary_torch"}, "err": null}
22
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H8_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 8, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.08575199990445981, "p50": 0.08742199997868738, "p90": 0.08754200007388135, "mean": 0.08730620002097567, "iqr": 0.00023999996301427018, "raw_times": [0.08754200007388135, 0.08730200011086708, 0.08575199990445981, 0.0885130000369827, 0.08742199997868738], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.0908419999632315, "peak_bytes": 202375168, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.590209919299923e-08, "mae_k": 1.590130160877834e-08, "mse_q": 2.4971026799330918e-15, "mse_k": 2.506967649153289e-15, "ref": "rotary_torch"}, "err": null}
23
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D64_R32", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 64, "rotary_dim": 32, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.25617599999350205, "p50": 0.2583450000201992, "p90": 0.2584750000096392, "mean": 0.2583615999810718, "iqr": 0.0005590000000665896, "raw_times": [0.2579160000095726, 0.2583450000201992, 0.25617599999350205, 0.2608959998724458, 0.2584750000096392], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.2549850000832521, "peak_bytes": 403177472, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.5847520629108658e-08, "mae_k": 1.5862454461057496e-08, "mse_q": 2.4917348203881045e-15, "mse_k": 2.491306009958557e-15, "ref": "rotary_torch"}, "err": null}
24
+ {"ts": "2025-12-19T23:02:26Z", "run": "93508ee044aa4a8e9e1214faa4421b45", "impl": "hf_kernels_rotary", "tags": {"family": "hf-kernels", "backend": "cuda"}, "wl": {"name": "cuda_B2_S2048_H32_D128_R64", "batch": 2, "seqlen": 2048, "num_heads": 32, "head_dim": 128, "rotary_dim": 64, "dtype": "float32", "device": "cuda", "seed": 0}, "env": {"torch": "2.8.0+cu128", "cuda": "12.8", "gpu": "NVIDIA L40S", "sm": "8.9", "py": "3.11.14", "plat": "Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35"}, "lat_ms": {"p10": 0.8457680000901746, "p50": 0.8511980001912889, "p90": 0.8513080001648632, "mean": 0.8505584000886302, "iqr": 0.003619000153776142, "raw_times": [0.8513080001648632, 0.8457680000901746, 0.847689000011087, 0.8511980001912889, 0.856828999985737], "has_warnings": false, "reps": 5, "warmup": 2}, "compile_ms": 0.8516880000115634, "peak_bytes": 806354944, "ok": true, "absmax": null, "corr": {"ok": true, "rtol": 1e-05, "atol": 1e-05, "absmax_q": 9.5367431640625e-07, "absmax_k": 9.5367431640625e-07, "mae_q": 1.585225106737198e-08, "mae_k": 1.581303976649906e-08, "mse_q": 2.4866460581992374e-15, "mse_k": 2.4721545950211372e-15, "ref": "rotary_torch"}, "err": null}
rotary/impls/hf_kernels_rotary.html CHANGED
The diff for this file is too large to render. See raw diff
 
rotary/impls/torch_rotary.html CHANGED
The diff for this file is too large to render. See raw diff
 
rotary/index.html CHANGED
@@ -3874,7 +3874,7 @@ body[data-tool="eraser"] .main-content {
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
- Linux x86_64 | Linux-6.12.58-82.121.amzn2023.x86_64-x86_64-with-glibc2.35
3878
  </div>
3879
  </div>
3880
 
 
3874
  <div class="system-info">
3875
  <div class="system-info-header">Generated on:</div>
3876
  <div class="system-info-content">
3877
+ Darwin arm64 | macOS-15.7.2-arm64-arm-64bit
3878
  </div>
3879
  </div>
3880