# Package-wide settings: the kernel package name and the `universal` flag.
# NOTE(review): every line of this file is wrapped in `| ... |`, which is not
# plain TOML - this looks like a table-extraction artifact; confirm against the
# original file before feeding it to the build tool.
| [general] |
| name = "layer_norm" |
# `universal` is disabled here; presumably because the package ships compiled
# CUDA code rather than being backend-independent - TODO confirm with the
# build tool's documentation.
| universal = false |
|
|
# Files listed for the Torch extension layer: the binding source and its
# header, both under torch-ext/.
| [torch] |
| src = [ |
| "torch-ext/torch_binding.cpp", |
| "torch-ext/torch_binding.h", |
| ] |
|
|
# Build description for the `layer_norm` kernel: its dependency, backend,
# target compute capabilities, include path, source list and compiler flags.
| [kernel.layer_norm] |
| depends = ["torch"] |
| backend = "cuda" |
# CUDA compute capabilities listed for this kernel, written as strings.
| cuda-capabilities = [ |
| "8.0", |
| "8.9", |
| "9.0", |
| "10.0", |
| "12.0", |
| ] |
# Include search path entry; "." is presumably resolved relative to this
# file's directory - TODO confirm.
| include = ["."] |
# Source and header files, all under layer_norm/.
# Entries are kept in plain string (lexicographic) order, which is why e.g.
# `_256` sorts after `_2048` and `_768` after `_7168`.
# Four families (ln_bwd, ln_fwd, ln_parallel_bwd, ln_parallel_fwd) each have
# one .cu file per numeric suffix: 256, 512, 768, 1024, 1280, 1536, 2048,
# 2560, 3072, 4096, 5120, 6144, 7168, 8192.
# NOTE(review): the suffix presumably denotes the hidden size handled by that
# translation unit - confirm against ln_api.cpp.
| src = [ |
| "layer_norm/ln.h", |
| "layer_norm/ln_api.cpp", |
| "layer_norm/ln_bwd_1024.cu", |
| "layer_norm/ln_bwd_1280.cu", |
| "layer_norm/ln_bwd_1536.cu", |
| "layer_norm/ln_bwd_2048.cu", |
| "layer_norm/ln_bwd_256.cu", |
| "layer_norm/ln_bwd_2560.cu", |
| "layer_norm/ln_bwd_3072.cu", |
| "layer_norm/ln_bwd_4096.cu", |
| "layer_norm/ln_bwd_512.cu", |
| "layer_norm/ln_bwd_5120.cu", |
| "layer_norm/ln_bwd_6144.cu", |
| "layer_norm/ln_bwd_7168.cu", |
| "layer_norm/ln_bwd_768.cu", |
| "layer_norm/ln_bwd_8192.cu", |
| "layer_norm/ln_bwd_kernels.cuh", |
| "layer_norm/ln_fwd_1024.cu", |
| "layer_norm/ln_fwd_1280.cu", |
| "layer_norm/ln_fwd_1536.cu", |
| "layer_norm/ln_fwd_2048.cu", |
| "layer_norm/ln_fwd_256.cu", |
| "layer_norm/ln_fwd_2560.cu", |
| "layer_norm/ln_fwd_3072.cu", |
| "layer_norm/ln_fwd_4096.cu", |
| "layer_norm/ln_fwd_512.cu", |
| "layer_norm/ln_fwd_5120.cu", |
| "layer_norm/ln_fwd_6144.cu", |
| "layer_norm/ln_fwd_7168.cu", |
| "layer_norm/ln_fwd_768.cu", |
| "layer_norm/ln_fwd_8192.cu", |
| "layer_norm/ln_fwd_kernels.cuh", |
| "layer_norm/ln_kernel_traits.h", |
| "layer_norm/ln_parallel_bwd_1024.cu", |
| "layer_norm/ln_parallel_bwd_1280.cu", |
| "layer_norm/ln_parallel_bwd_1536.cu", |
| "layer_norm/ln_parallel_bwd_2048.cu", |
| "layer_norm/ln_parallel_bwd_256.cu", |
| "layer_norm/ln_parallel_bwd_2560.cu", |
| "layer_norm/ln_parallel_bwd_3072.cu", |
| "layer_norm/ln_parallel_bwd_4096.cu", |
| "layer_norm/ln_parallel_bwd_512.cu", |
| "layer_norm/ln_parallel_bwd_5120.cu", |
| "layer_norm/ln_parallel_bwd_6144.cu", |
| "layer_norm/ln_parallel_bwd_7168.cu", |
| "layer_norm/ln_parallel_bwd_768.cu", |
| "layer_norm/ln_parallel_bwd_8192.cu", |
| "layer_norm/ln_parallel_fwd_1024.cu", |
| "layer_norm/ln_parallel_fwd_1280.cu", |
| "layer_norm/ln_parallel_fwd_1536.cu", |
| "layer_norm/ln_parallel_fwd_2048.cu", |
| "layer_norm/ln_parallel_fwd_256.cu", |
| "layer_norm/ln_parallel_fwd_2560.cu", |
| "layer_norm/ln_parallel_fwd_3072.cu", |
| "layer_norm/ln_parallel_fwd_4096.cu", |
| "layer_norm/ln_parallel_fwd_512.cu", |
| "layer_norm/ln_parallel_fwd_5120.cu", |
| "layer_norm/ln_parallel_fwd_6144.cu", |
| "layer_norm/ln_parallel_fwd_7168.cu", |
| "layer_norm/ln_parallel_fwd_768.cu", |
| "layer_norm/ln_parallel_fwd_8192.cu", |
| "layer_norm/ln_parallel_residual_bwd_kernels.cuh", |
| "layer_norm/ln_parallel_residual_fwd_kernels.cuh", |
| "layer_norm/ln_utils.cuh", |
| "layer_norm/static_switch.h" |
| ] |
# Host C++ compiler flags: define FLASHATTENTION_DISABLE_PYBIND and request
# the large code model.
# NOTE(review): -mcmodel=large is presumably needed because of the number of
# kernel instantiations linked into one binary - confirm.
| cxx-flags = ["-DFLASHATTENTION_DISABLE_PYBIND", "-mcmodel=large"] |
# Flags passed for CUDA compilation.
# The -U entries undefine the __CUDA_NO_* macros for half / bfloat16 /
# bfloat162 operators and conversions; presumably so the kernels can use those
# operators and conversions - TODO confirm.
# --use_fast_math trades floating-point accuracy for speed (see nvcc docs).
| cuda-flags = [ |
| "-O3", |
| "-U__CUDA_NO_HALF_OPERATORS__", |
| "-U__CUDA_NO_HALF_CONVERSIONS__", |
| "-U__CUDA_NO_BFLOAT16_OPERATORS__", |
| "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", |
| "-U__CUDA_NO_BFLOAT162_OPERATORS__", |
| "-U__CUDA_NO_BFLOAT162_CONVERSIONS__", |
| "--expt-relaxed-constexpr", |
| "--expt-extended-lambda", |
| "--use_fast_math", |
| ] |
|
|
|
|