diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..e6f10209dce527bab07de69f8177f1cea47ee6ac
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,49 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu126-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu128-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu130-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu129-aarch64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu126-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu128-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch210-cxx11-cu130-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu126-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu128-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch211-cxx11-cu130-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
+build/torch29-cxx11-cu129-x86_64-linux/_layer_norm_cuda_143103b.abi3.so filter=lfs diff=lfs merge=lfs -text
diff --git a/benchmarks/benchmark.py b/benchmarks/benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..d943bf949eafb80e00f77b26fdb620957cab218f
--- /dev/null
+++ b/benchmarks/benchmark.py
@@ -0,0 +1,9 @@
+from kernels.benchmarks import LayerNormBenchmark, RMSNormBenchmark
+
+
+class LayerNorm(LayerNormBenchmark):
+    pass
+
+
+class RMSNorm(RMSNormBenchmark):
+    pass
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+    return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+    return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+    return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+    return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+__all__ = [
+    "layers",
+    "dropout_add_ln_fwd",
+    "dropout_add_ln_bwd",
+    "dropout_add_ln_parallel_residual_fwd",
+    "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_layer_norm_cuda_143103b.abi3.so b/build/torch210-cxx11-cu126-aarch64-linux/_layer_norm_cuda_143103b.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..ea915293e3ca58da096a42951be6cf9683d44cb1
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_layer_norm_cuda_143103b.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e4ad4d5e8dee889d696c0b60ac2aa7c24586412b5605a33722f45a13a077b88d
+size 711710472
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/_ops.py b/build/torch210-cxx11-cu126-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..a00eb69561f0e1b8ca51a5c60d13d9e6a86097c1
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_143103b
+ops = torch.ops._layer_norm_cuda_143103b
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_layer_norm_cuda_143103b::{op_name}"
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/layer_norm/__init__.py b/build/torch210-cxx11-cu126-aarch64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/layers.py b/build/torch210-cxx11-cu126-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        output = ops.dropout_add_ln_fwd(
+            hidden_states.view(-1, hidden_states.shape[-1]),
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = False,
+        )
+        return output[0].view(hidden_states.shape)
+
+class LlamaRMSNorm(nn.Module):
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        output = ops.dropout_add_ln_fwd(
+            hidden_states.view(-1, hidden_states.shape[-1]),
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = True,
+        )
+        return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu126-aarch64-linux/metadata.json b/build/torch210-cxx11-cu126-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..fd730a50e49f5ef3cc7ec34adeda65a8f6067c66
--- /dev/null
+++ b/build/torch210-cxx11-cu126-aarch64-linux/metadata.json
@@ -0,0 +1,13 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "8.0",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+    return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+    return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+    return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+    return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+__all__ = [
+    "layers",
+    "dropout_add_ln_fwd",
+    "dropout_add_ln_bwd",
+    "dropout_add_ln_parallel_residual_fwd",
+    "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_layer_norm_cuda_143103b.abi3.so b/build/torch210-cxx11-cu126-x86_64-linux/_layer_norm_cuda_143103b.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..dc8c2c807725336ed37017092beb4d789de410ff
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_layer_norm_cuda_143103b.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e466713afb0ca13d2c60a66eae845ddfcaf1ac98e5297d52068e54da831564c4
+size 712093824
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/_ops.py b/build/torch210-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..a00eb69561f0e1b8ca51a5c60d13d9e6a86097c1
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_143103b
+ops = torch.ops._layer_norm_cuda_143103b
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_layer_norm_cuda_143103b::{op_name}"
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/layer_norm/__init__.py b/build/torch210-cxx11-cu126-x86_64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/layers.py b/build/torch210-cxx11-cu126-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        output = ops.dropout_add_ln_fwd(
+            hidden_states.view(-1, hidden_states.shape[-1]),
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = False,
+        )
+        return output[0].view(hidden_states.shape)
+
+class LlamaRMSNorm(nn.Module):
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        output = ops.dropout_add_ln_fwd(
+            hidden_states.view(-1, hidden_states.shape[-1]),
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = True,
+        )
+        return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu126-x86_64-linux/metadata.json b/build/torch210-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..fd730a50e49f5ef3cc7ec34adeda65a8f6067c66
--- /dev/null
+++ b/build/torch210-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,13 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "8.0",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
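The `layer_norm/__init__.py` shim above keys `sys.modules` on a hash of the file path so that identically named files from different build directories never collide. A standalone sketch of the same trick (the `variant_a`/`variant_b` paths are hypothetical):

```python
import ctypes
import importlib.util
import sys
from pathlib import Path

def import_from_path(file_path: Path):
    # Hex-encode the platform-sized unsigned hash of the absolute path, so
    # the sys.modules key is unique per location, not per file name.
    module_name = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
    spec = importlib.util.spec_from_file_location(module_name, file_path)
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    spec.loader.exec_module(module)
    return module

# Two same-named files from different variant directories load independently:
# a = import_from_path(Path("build/variant_a/__init__.py"))
# b = import_from_path(Path("build/variant_b/__init__.py"))
# assert a is not b
```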
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+    return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+    return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+    return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+    return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+__all__ = [
+    "layers",
+    "dropout_add_ln_fwd",
+    "dropout_add_ln_bwd",
+    "dropout_add_ln_parallel_residual_fwd",
+    "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_layer_norm_cuda_143103b.abi3.so b/build/torch210-cxx11-cu128-aarch64-linux/_layer_norm_cuda_143103b.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..e163dd3d22e61b39a306def0a4ae132deba33abf
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_layer_norm_cuda_143103b.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af5417fb89bb13567875514abfde1d4a087bb0295de1f53aaaf20edb0eb562c1
+size 1231083200
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/_ops.py b/build/torch210-cxx11-cu128-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..a00eb69561f0e1b8ca51a5c60d13d9e6a86097c1
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_143103b
+ops = torch.ops._layer_norm_cuda_143103b
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_layer_norm_cuda_143103b::{op_name}"
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/layer_norm/__init__.py b/build/torch210-cxx11-cu128-aarch64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/layers.py b/build/torch210-cxx11-cu128-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        output = ops.dropout_add_ln_fwd(
+            hidden_states.view(-1, hidden_states.shape[-1]),
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = False,
+        )
+        return output[0].view(hidden_states.shape)
+
+class LlamaRMSNorm(nn.Module):
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        output = ops.dropout_add_ln_fwd(
+            hidden_states.view(-1, hidden_states.shape[-1]),
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = True,
+        )
+        return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu128-aarch64-linux/metadata.json b/build/torch210-cxx11-cu128-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..5bf2743043146a22147fad17302989accb0b505c
--- /dev/null
+++ b/build/torch210-cxx11-cu128-aarch64-linux/metadata.json
@@ -0,0 +1,15 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "12.0",
+      "8.0",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
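As a reference for what the fused kernel computes in the configuration `layers.py` uses (`dropout_p=0`, `beta=None`, no residual), here is a plain-PyTorch sketch of both normalization paths. The fp32 accumulation mirrors common practice in fused norm kernels and is an assumption, not something this diff states.

```python
import torch

def rms_norm_ref(x: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
    # is_rms_norm=True path: scale by reciprocal RMS, no mean subtraction.
    x32 = x.float()
    rsigma = torch.rsqrt(x32.pow(2).mean(dim=-1, keepdim=True) + eps)
    return (x32 * rsigma).to(x.dtype) * weight

def layer_norm_ref(x: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
    # is_rms_norm=False path with beta=None: subtract mean, then normalize.
    x32 = x.float()
    mu = x32.mean(dim=-1, keepdim=True)
    rsigma = torch.rsqrt(x32.var(dim=-1, keepdim=True, unbiased=False) + eps)
    return ((x32 - mu) * rsigma).to(x.dtype) * weight
```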
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+    return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+    return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+    return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+    return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+__all__ = [
+    "layers",
+    "dropout_add_ln_fwd",
+    "dropout_add_ln_bwd",
+    "dropout_add_ln_parallel_residual_fwd",
+    "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_layer_norm_cuda_143103b.abi3.so b/build/torch210-cxx11-cu128-x86_64-linux/_layer_norm_cuda_143103b.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..f5cf30f03d044b3a39d9bf402696fe316bd7f733
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_layer_norm_cuda_143103b.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ac7b943c6c28ccdfd408ef86ccc7ae5eb21adc6385f8e832141e0cbccd3eecd
+size 1231419520
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/_ops.py b/build/torch210-cxx11-cu128-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..a00eb69561f0e1b8ca51a5c60d13d9e6a86097c1
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_143103b
+ops = torch.ops._layer_norm_cuda_143103b
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_layer_norm_cuda_143103b::{op_name}"
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/layer_norm/__init__.py b/build/torch210-cxx11-cu128-x86_64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/layers.py b/build/torch210-cxx11-cu128-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        output = ops.dropout_add_ln_fwd(
+            hidden_states.view(-1, hidden_states.shape[-1]),
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = False,
+        )
+        return output[0].view(hidden_states.shape)
+
+class LlamaRMSNorm(nn.Module):
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        output = ops.dropout_add_ln_fwd(
+            hidden_states.view(-1, hidden_states.shape[-1]),
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = True,
+        )
+        return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu128-x86_64-linux/metadata.json b/build/torch210-cxx11-cu128-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..5bf2743043146a22147fad17302989accb0b505c
--- /dev/null
+++ b/build/torch210-cxx11-cu128-x86_64-linux/metadata.json
@@ -0,0 +1,15 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "12.0",
+      "8.0",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
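The build directories follow a `torch<major><minor>-cxx11-cu<version>-<arch>-linux` pattern inferred from the paths in this diff. A hypothetical helper for picking the matching variant at runtime (not an API this repo provides):

```python
import platform
import torch

def variant_dir() -> str:
    # e.g. torch "2.11.0" -> "211"; CUDA "12.8" -> "128";
    # platform.machine() -> "x86_64" or "aarch64".
    # Note: torch.version.cuda is None on CPU-only builds.
    torch_ver = "".join(torch.__version__.split("+")[0].split(".")[:2])
    cuda_ver = "".join(torch.version.cuda.split("."))
    return f"torch{torch_ver}-cxx11-cu{cuda_ver}-{platform.machine()}-linux"

# variant_dir() -> "torch211-cxx11-cu128-x86_64-linux" on a matching system.
```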
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+    return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+    return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+    return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+    return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+__all__ = [
+    "layers",
+    "dropout_add_ln_fwd",
+    "dropout_add_ln_bwd",
+    "dropout_add_ln_parallel_residual_fwd",
+    "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_layer_norm_cuda_143103b.abi3.so b/build/torch210-cxx11-cu130-aarch64-linux/_layer_norm_cuda_143103b.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..c2800769b73af95f17f29b0b1efe37ab277dd73f
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_layer_norm_cuda_143103b.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c61421662fd4b22a3eb3d75fa5d9fcd3775ad8d8bbcf9c638b89e427111927b3
+size 1235993064
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/_ops.py b/build/torch210-cxx11-cu130-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..a00eb69561f0e1b8ca51a5c60d13d9e6a86097c1
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_143103b
+ops = torch.ops._layer_norm_cuda_143103b
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_layer_norm_cuda_143103b::{op_name}"
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/layer_norm/__init__.py b/build/torch210-cxx11-cu130-aarch64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/layers.py b/build/torch210-cxx11-cu130-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        output = ops.dropout_add_ln_fwd(
+            hidden_states.view(-1, hidden_states.shape[-1]),
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = False,
+        )
+        return output[0].view(hidden_states.shape)
+
+class LlamaRMSNorm(nn.Module):
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        output = ops.dropout_add_ln_fwd(
+            hidden_states.view(-1, hidden_states.shape[-1]),
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = True,
+        )
+        return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu130-aarch64-linux/metadata.json b/build/torch210-cxx11-cu130-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..5bf2743043146a22147fad17302989accb0b505c
--- /dev/null
+++ b/build/torch210-cxx11-cu130-aarch64-linux/metadata.json
@@ -0,0 +1,15 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "12.0",
+      "8.0",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+    return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+    return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+    return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+    return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+__all__ = [
+    "layers",
+    "dropout_add_ln_fwd",
+    "dropout_add_ln_bwd",
+    "dropout_add_ln_parallel_residual_fwd",
+    "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_layer_norm_cuda_143103b.abi3.so b/build/torch210-cxx11-cu130-x86_64-linux/_layer_norm_cuda_143103b.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..dfee4bcc5a92cac8b1fcc9a84540dadf3464df63
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_layer_norm_cuda_143103b.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3821d36be629ae880f15967c86f432bb1c26b4d1dfef430d7f905121eaf0781b
+size 1238332560
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/_ops.py b/build/torch210-cxx11-cu130-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..a00eb69561f0e1b8ca51a5c60d13d9e6a86097c1
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_143103b
+ops = torch.ops._layer_norm_cuda_143103b
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_layer_norm_cuda_143103b::{op_name}"
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/layer_norm/__init__.py b/build/torch210-cxx11-cu130-x86_64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/layers.py b/build/torch210-cxx11-cu130-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        output = ops.dropout_add_ln_fwd(
+            hidden_states.view(-1, hidden_states.shape[-1]),
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = False,
+        )
+        return output[0].view(hidden_states.shape)
+
+class LlamaRMSNorm(nn.Module):
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        output = ops.dropout_add_ln_fwd(
+            hidden_states.view(-1, hidden_states.shape[-1]),
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = True,
+        )
+        return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch210-cxx11-cu130-x86_64-linux/metadata.json b/build/torch210-cxx11-cu130-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..5bf2743043146a22147fad17302989accb0b505c
--- /dev/null
+++ b/build/torch210-cxx11-cu130-x86_64-linux/metadata.json
@@ -0,0 +1,15 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "12.0",
+      "8.0",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+    return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+    return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+    return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+    return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+__all__ = [
+    "layers",
+    "dropout_add_ln_fwd",
+    "dropout_add_ln_bwd",
+    "dropout_add_ln_parallel_residual_fwd",
+    "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_layer_norm_cuda_143103b.abi3.so b/build/torch211-cxx11-cu126-aarch64-linux/_layer_norm_cuda_143103b.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..e7f816958eac46fdfa725848eb30509ea3d260f5
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_layer_norm_cuda_143103b.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d1b35f4d42743357fe2fbd9feb0ce20d28506154325e0745c97aa99d829bda8
+size 711706784
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/_ops.py b/build/torch211-cxx11-cu126-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..a00eb69561f0e1b8ca51a5c60d13d9e6a86097c1
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_143103b
+ops = torch.ops._layer_norm_cuda_143103b
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_layer_norm_cuda_143103b::{op_name}"
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/layer_norm/__init__.py b/build/torch211-cxx11-cu126-aarch64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/layers.py b/build/torch211-cxx11-cu126-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        output = ops.dropout_add_ln_fwd(
+            hidden_states.view(-1, hidden_states.shape[-1]),
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = False,
+        )
+        return output[0].view(hidden_states.shape)
+
+class LlamaRMSNorm(nn.Module):
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        output = ops.dropout_add_ln_fwd(
+            hidden_states.view(-1, hidden_states.shape[-1]),
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = True,
+        )
+        return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu126-aarch64-linux/metadata.json b/build/torch211-cxx11-cu126-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..fd730a50e49f5ef3cc7ec34adeda65a8f6067c66
--- /dev/null
+++ b/build/torch211-cxx11-cu126-aarch64-linux/metadata.json
@@ -0,0 +1,13 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "8.0",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+    return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+    return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+    return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+    return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+__all__ = [
+    "layers",
+    "dropout_add_ln_fwd",
+    "dropout_add_ln_bwd",
+    "dropout_add_ln_parallel_residual_fwd",
+    "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_layer_norm_cuda_143103b.abi3.so b/build/torch211-cxx11-cu126-x86_64-linux/_layer_norm_cuda_143103b.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..3624f4d3aea9b4ef58850f06a4f6cb95851d6129
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_layer_norm_cuda_143103b.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa655742dc940fbe87dc0d87a17bbd1dc6a4d7a5fb99ec4d3f0f16bbb875243d
+size 712082776
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/_ops.py b/build/torch211-cxx11-cu126-x86_64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..a00eb69561f0e1b8ca51a5c60d13d9e6a86097c1
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_143103b
+ops = torch.ops._layer_norm_cuda_143103b
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_layer_norm_cuda_143103b::{op_name}"
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/layer_norm/__init__.py b/build/torch211-cxx11-cu126-x86_64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/layers.py b/build/torch211-cxx11-cu126-x86_64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        output = ops.dropout_add_ln_fwd(
+            hidden_states.view(-1, hidden_states.shape[-1]),
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = False,
+        )
+        return output[0].view(hidden_states.shape)
+
+class LlamaRMSNorm(nn.Module):
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        output = ops.dropout_add_ln_fwd(
+            hidden_states.view(-1, hidden_states.shape[-1]),
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = True,
+        )
+        return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu126-x86_64-linux/metadata.json b/build/torch211-cxx11-cu126-x86_64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..fd730a50e49f5ef3cc7ec34adeda65a8f6067c66
--- /dev/null
+++ b/build/torch211-cxx11-cu126-x86_64-linux/metadata.json
@@ -0,0 +1,13 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "8.0",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/__init__.py
@@ -0,0 +1,26 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+from . import layers
+
+def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm):
+    return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm):
+    return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm):
+    return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm)
+
+def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm):
+    return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm)
+
+__all__ = [
+    "layers",
+    "dropout_add_ln_fwd",
+    "dropout_add_ln_bwd",
+    "dropout_add_ln_parallel_residual_fwd",
+    "dropout_add_ln_parallel_residual_bwd",
+]
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_layer_norm_cuda_143103b.abi3.so b/build/torch211-cxx11-cu128-aarch64-linux/_layer_norm_cuda_143103b.abi3.so
new file mode 100644
index 0000000000000000000000000000000000000000..2585aebb654365cbaeebcd56ed563e5f4a3dffff
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_layer_norm_cuda_143103b.abi3.so
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5e69e28fd3ca606b2519b7d64c292a8008db36f8b049d2bde0bf22c2fee0849
+size 1231079512
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/_ops.py b/build/torch211-cxx11-cu128-aarch64-linux/_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..a00eb69561f0e1b8ca51a5c60d13d9e6a86097c1
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/_ops.py
@@ -0,0 +1,9 @@
+import torch
+from . import _layer_norm_cuda_143103b
+ops = torch.ops._layer_norm_cuda_143103b
+
+def add_op_namespace_prefix(op_name: str):
+    """
+    Prefix op by namespace.
+    """
+    return f"_layer_norm_cuda_143103b::{op_name}"
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/layer_norm/__init__.py b/build/torch211-cxx11-cu128-aarch64-linux/layer_norm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/layer_norm/__init__.py
@@ -0,0 +1,26 @@
+import ctypes
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _import_from_path(file_path: Path) -> ModuleType:
+    # We cannot use the module name as-is, after adding it to `sys.modules`,
+    # it would also be used for other imports. So, we make a module name that
+    # depends on the path for it to be unique using the hex-encoded hash of
+    # the path.
+    path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value)
+    module_name = path_hash
+    spec = importlib.util.spec_from_file_location(module_name, file_path)
+    if spec is None:
+        raise ImportError(f"Cannot load spec for {module_name} from {file_path}")
+    module = importlib.util.module_from_spec(spec)
+    if module is None:
+        raise ImportError(f"Cannot load module {module_name} from spec")
+    sys.modules[module_name] = module
+    spec.loader.exec_module(module)  # type: ignore
+    return module
+
+
+globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py")))
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/layers.py b/build/torch211-cxx11-cu128-aarch64-linux/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/layers.py
@@ -0,0 +1,51 @@
+import torch
+import torch.nn as nn
+
+from ._ops import ops
+
+
+class LayerNorm(nn.Module):
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        output = ops.dropout_add_ln_fwd(
+            hidden_states.view(-1, hidden_states.shape[-1]),
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = False,
+        )
+        return output[0].view(hidden_states.shape)
+
+class LlamaRMSNorm(nn.Module):
+    weight: torch.Tensor
+    variance_epsilon: float
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        output = ops.dropout_add_ln_fwd(
+            hidden_states.view(-1, hidden_states.shape[-1]),
+            gamma = self.weight,
+            beta = None,
+            rowscale = None,
+            colscale = None,
+            x0_subset = None,
+            z_subset = None,
+            dropout_p = 0,
+            epsilon = self.variance_epsilon,
+            rowscale_const = 1.0,
+            z_numrows = hidden_states.shape[1],
+            gen = None,
+            residual_in_fp32 = False,
+            is_rms_norm = True,
+        )
+        return output[0].view(hidden_states.shape)
\ No newline at end of file
diff --git a/build/torch211-cxx11-cu128-aarch64-linux/metadata.json b/build/torch211-cxx11-cu128-aarch64-linux/metadata.json
new file mode 100644
index 0000000000000000000000000000000000000000..5bf2743043146a22147fad17302989accb0b505c
--- /dev/null
+++ b/build/torch211-cxx11-cu128-aarch64-linux/metadata.json
@@ -0,0 +1,15 @@
+{
+  "version": 1,
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "12.0",
+      "8.0",
+      "8.9",
+      "9.0"
+    ]
+  }
+}
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_layer_norm_cuda_143103b.abi3.so b/build/torch211-cxx11-cu128-x86_64-linux/_layer_norm_cuda_143103b.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..d8486dbfcb4b97029597a7f8161fae44e9649a10 --- /dev/null +++ b/build/torch211-cxx11-cu128-x86_64-linux/_layer_norm_cuda_143103b.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:047a71d1ae85841cc6d6a1a3fdd70aeda0f5f23ded37e74537d1f21d2f47d670 +size 1231408464 diff --git a/build/torch211-cxx11-cu128-x86_64-linux/_ops.py b/build/torch211-cxx11-cu128-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..a00eb69561f0e1b8ca51a5c60d13d9e6a86097c1 --- /dev/null +++ b/build/torch211-cxx11-cu128-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_143103b +ops = torch.ops._layer_norm_cuda_143103b + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_143103b::{op_name}" diff --git a/build/torch211-cxx11-cu128-x86_64-linux/layer_norm/__init__.py b/build/torch211-cxx11-cu128-x86_64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch211-cxx11-cu128-x86_64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch211-cxx11-cu128-x86_64-linux/layers.py b/build/torch211-cxx11-cu128-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch211-cxx11-cu128-x86_64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch211-cxx11-cu128-x86_64-linux/metadata.json b/build/torch211-cxx11-cu128-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..5bf2743043146a22147fad17302989accb0b505c --- /dev/null +++ b/build/torch211-cxx11-cu128-x86_64-linux/metadata.json @@ -0,0 +1,15 @@ +{ + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "12.0", + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch211-cxx11-cu130-aarch64-linux/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch211-cxx11-cu130-aarch64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_layer_norm_cuda_143103b.abi3.so b/build/torch211-cxx11-cu130-aarch64-linux/_layer_norm_cuda_143103b.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..a56d13dfc06c482ca9f89cae8b3f4b31a8710129 --- /dev/null +++ b/build/torch211-cxx11-cu130-aarch64-linux/_layer_norm_cuda_143103b.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3147d8b7ea6ae20c72894191e79992c1825505e22b12b9fdf6df9d666504ab82 +size 1235989376 diff --git a/build/torch211-cxx11-cu130-aarch64-linux/_ops.py b/build/torch211-cxx11-cu130-aarch64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..a00eb69561f0e1b8ca51a5c60d13d9e6a86097c1 --- /dev/null +++ b/build/torch211-cxx11-cu130-aarch64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_143103b +ops = torch.ops._layer_norm_cuda_143103b + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_143103b::{op_name}" diff --git a/build/torch211-cxx11-cu130-aarch64-linux/layer_norm/__init__.py b/build/torch211-cxx11-cu130-aarch64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch211-cxx11-cu130-aarch64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch211-cxx11-cu130-aarch64-linux/layers.py b/build/torch211-cxx11-cu130-aarch64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch211-cxx11-cu130-aarch64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch211-cxx11-cu130-aarch64-linux/metadata.json b/build/torch211-cxx11-cu130-aarch64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..5bf2743043146a22147fad17302989accb0b505c --- /dev/null +++ b/build/torch211-cxx11-cu130-aarch64-linux/metadata.json @@ -0,0 +1,15 @@ +{ + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "12.0", + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch211-cxx11-cu130-x86_64-linux/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch211-cxx11-cu130-x86_64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_layer_norm_cuda_143103b.abi3.so b/build/torch211-cxx11-cu130-x86_64-linux/_layer_norm_cuda_143103b.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..af9b2d7460615785076d1e209fb882954ba9375d --- /dev/null +++ b/build/torch211-cxx11-cu130-x86_64-linux/_layer_norm_cuda_143103b.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0eb22c7b8606869f2aaeb4e5163bfae654ab5a405b123a102cf6568fda508ca0 +size 1238325592 diff --git a/build/torch211-cxx11-cu130-x86_64-linux/_ops.py b/build/torch211-cxx11-cu130-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..a00eb69561f0e1b8ca51a5c60d13d9e6a86097c1 --- /dev/null +++ b/build/torch211-cxx11-cu130-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_143103b +ops = torch.ops._layer_norm_cuda_143103b + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_143103b::{op_name}" diff --git a/build/torch211-cxx11-cu130-x86_64-linux/layer_norm/__init__.py b/build/torch211-cxx11-cu130-x86_64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch211-cxx11-cu130-x86_64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch211-cxx11-cu130-x86_64-linux/layers.py b/build/torch211-cxx11-cu130-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch211-cxx11-cu130-x86_64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch211-cxx11-cu130-x86_64-linux/metadata.json b/build/torch211-cxx11-cu130-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..5bf2743043146a22147fad17302989accb0b505c --- /dev/null +++ b/build/torch211-cxx11-cu130-x86_64-linux/metadata.json @@ -0,0 +1,15 @@ +{ + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "12.0", + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch29-cxx11-cu129-aarch64-linux/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch29-cxx11-cu129-aarch64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_layer_norm_cuda_143103b.abi3.so b/build/torch29-cxx11-cu129-aarch64-linux/_layer_norm_cuda_143103b.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..0fb5b4a1f4a1d7e4b4175f2c189353ef682506e5 --- /dev/null +++ b/build/torch29-cxx11-cu129-aarch64-linux/_layer_norm_cuda_143103b.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5edebe8b0f52b1c169e80f3502abb83cfc4cf2438b5771387e8dfd9254eef1a8 +size 1282721000 diff --git a/build/torch29-cxx11-cu129-aarch64-linux/_ops.py b/build/torch29-cxx11-cu129-aarch64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..a00eb69561f0e1b8ca51a5c60d13d9e6a86097c1 --- /dev/null +++ b/build/torch29-cxx11-cu129-aarch64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_143103b +ops = torch.ops._layer_norm_cuda_143103b + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_143103b::{op_name}" diff --git a/build/torch29-cxx11-cu129-aarch64-linux/layer_norm/__init__.py b/build/torch29-cxx11-cu129-aarch64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch29-cxx11-cu129-aarch64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch29-cxx11-cu129-aarch64-linux/layers.py b/build/torch29-cxx11-cu129-aarch64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch29-cxx11-cu129-aarch64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch29-cxx11-cu129-aarch64-linux/metadata.json b/build/torch29-cxx11-cu129-aarch64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..5bf2743043146a22147fad17302989accb0b505c --- /dev/null +++ b/build/torch29-cxx11-cu129-aarch64-linux/metadata.json @@ -0,0 +1,15 @@ +{ + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "12.0", + "8.0", + "8.9", + "9.0" + ] + } +} diff --git a/build/torch29-cxx11-cu129-x86_64-linux/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..946160f16b9dc91fefeea037fb7ac84fd6afd802 --- /dev/null +++ b/build/torch29-cxx11-cu129-x86_64-linux/__init__.py @@ -0,0 +1,26 @@ +import torch +import torch.nn as nn + +from ._ops import ops + +from . 
import layers + +def dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_fwd(input, gamma, beta, rowscale, colscale, x0_subset, z_subset, dropout_p, epsilon, rowscale_const, z_numrows, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm): + return ops.dropout_add_ln_bwd(dz, dx, x, mu, rsigma, gamma, rowscale, colscale, x0_subset, z_subset, dropout_p, rowscale_const, x0_numrows, has_residual, is_rms_norm) + +def dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_fwd(input, gamma0, beta0, gamma1, beta1, dropout_p, epsilon, gen, residual_in_fp32, is_rms_norm) + +def dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm): + return ops.dropout_add_ln_parallel_residual_bwd(dz0, dz1, dx, x, mu, rsigma, gamma0, gamma1, dropout_p, has_x1, has_residual, is_rms_norm) + +__all__ = [ + "layers", + "dropout_add_ln_fwd", + "dropout_add_ln_bwd", + "dropout_add_ln_parallel_residual_fwd", + "dropout_add_ln_parallel_residual_bwd", +] \ No newline at end of file diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_layer_norm_cuda_143103b.abi3.so b/build/torch29-cxx11-cu129-x86_64-linux/_layer_norm_cuda_143103b.abi3.so new file mode 100644 index 0000000000000000000000000000000000000000..a982e905c881aa0e48c677a89779007de2abed12 --- /dev/null +++ b/build/torch29-cxx11-cu129-x86_64-linux/_layer_norm_cuda_143103b.abi3.so @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d0e08047c46ba37ec87cdd6ebd77249d8e347737318de96cdc011bad18dd61a +size 1283022120 diff --git a/build/torch29-cxx11-cu129-x86_64-linux/_ops.py b/build/torch29-cxx11-cu129-x86_64-linux/_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..a00eb69561f0e1b8ca51a5c60d13d9e6a86097c1 --- /dev/null +++ b/build/torch29-cxx11-cu129-x86_64-linux/_ops.py @@ -0,0 +1,9 @@ +import torch +from . import _layer_norm_cuda_143103b +ops = torch.ops._layer_norm_cuda_143103b + +def add_op_namespace_prefix(op_name: str): + """ + Prefix op by namespace. + """ + return f"_layer_norm_cuda_143103b::{op_name}" diff --git a/build/torch29-cxx11-cu129-x86_64-linux/layer_norm/__init__.py b/build/torch29-cxx11-cu129-x86_64-linux/layer_norm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b2672c1cd85b74c1b3ded0fc0b2100e1aeac23 --- /dev/null +++ b/build/torch29-cxx11-cu129-x86_64-linux/layer_norm/__init__.py @@ -0,0 +1,26 @@ +import ctypes +import importlib.util +import sys +from pathlib import Path +from types import ModuleType + + +def _import_from_path(file_path: Path) -> ModuleType: + # We cannot use the module name as-is, after adding it to `sys.modules`, + # it would also be used for other imports. So, we make a module name that + # depends on the path for it to be unique using the hex-encoded hash of + # the path. 
+ path_hash = "{:x}".format(ctypes.c_size_t(hash(file_path.absolute())).value) + module_name = path_hash + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ImportError(f"Cannot load spec for {module_name} from {file_path}") + module = importlib.util.module_from_spec(spec) + if module is None: + raise ImportError(f"Cannot load module {module_name} from spec") + sys.modules[module_name] = module + spec.loader.exec_module(module) # type: ignore + return module + + +globals().update(vars(_import_from_path(Path(__file__).parent.parent / "__init__.py"))) diff --git a/build/torch29-cxx11-cu129-x86_64-linux/layers.py b/build/torch29-cxx11-cu129-x86_64-linux/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..7ed883f42ead452f8b60f498ec11302c53d3cf74 --- /dev/null +++ b/build/torch29-cxx11-cu129-x86_64-linux/layers.py @@ -0,0 +1,51 @@ +import torch +import torch.nn as nn + +from ._ops import ops + + +class LayerNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = False, + ) + return output[0].view(hidden_states.shape) + +class LlamaRMSNorm(nn.Module): + weight: torch.Tensor + variance_epsilon: float + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + output = ops.dropout_add_ln_fwd( + hidden_states.view(-1, hidden_states.shape[-1]), + gamma = self.weight, + beta = None, + rowscale = None, + colscale = None, + x0_subset = None, + z_subset = None, + dropout_p = 0, + epsilon = self.variance_epsilon, + rowscale_const = 1.0, + z_numrows = hidden_states.shape[1], + gen = None, + residual_in_fp32 = False, + is_rms_norm = True, + ) + return output[0].view(hidden_states.shape) \ No newline at end of file diff --git a/build/torch29-cxx11-cu129-x86_64-linux/metadata.json b/build/torch29-cxx11-cu129-x86_64-linux/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..5bf2743043146a22147fad17302989accb0b505c --- /dev/null +++ b/build/torch29-cxx11-cu129-x86_64-linux/metadata.json @@ -0,0 +1,15 @@ +{ + "version": 1, + "license": "BSD-3-Clause", + "python-depends": [], + "backend": { + "type": "cuda", + "archs": [ + "10.0", + "12.0", + "8.0", + "8.9", + "9.0" + ] + } +}
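Every build variant above ships the same Python surface: a `layers` module whose `LayerNorm` and `LlamaRMSNorm` call the fused `ops.dropout_add_ln_fwd` kernel with dropout disabled (`dropout_p = 0`), plus thin functional wrappers re-exported from `__init__.py`. The sketch below is illustrative only: it assumes the Hugging Face `kernels` package is installed, that this kernel is published under a hub id such as "kernels-community/layer-norm" (a placeholder; the diff does not name the published repository), and that a build matching the local torch/CUDA/architecture combination exists under `build/`. The reference computation mirrors what `LlamaRMSNorm.forward` requests from the fused op (`is_rms_norm = True`).

import torch
from kernels import get_kernel  # assumption: Hugging Face `kernels` package

# Placeholder hub id; the diff above does not state the published name.
layer_norm = get_kernel("kernels-community/layer-norm")

# `layers.LlamaRMSNorm` reads `weight` and `variance_epsilon` off the
# module instance, so set them explicitly for a standalone test.
rms = layer_norm.layers.LlamaRMSNorm()
rms.weight = torch.nn.Parameter(
    torch.ones(4096, device="cuda", dtype=torch.float16)
)
rms.variance_epsilon = 1e-6

x = torch.randn(2, 128, 4096, device="cuda", dtype=torch.float16)
y = rms(x)  # fused ops.dropout_add_ln_fwd(..., dropout_p=0, is_rms_norm=True)

# Pure-PyTorch RMSNorm reference for a sanity check.
ref = x.float() * torch.rsqrt(
    x.float().pow(2).mean(-1, keepdim=True) + rms.variance_epsilon
)
ref = (ref * rms.weight.float()).to(x.dtype)
torch.testing.assert_close(y, ref, rtol=1e-2, atol=1e-2)

On the loader design visible in the per-variant `layer_norm/__init__.py`: `_import_from_path` re-executes the parent `__init__.py` under a module name derived from the hex-encoded hash of the file's absolute path, so several build directories can be loaded side by side in one process without colliding in `sys.modules`, and then copies the resulting globals into the shim package.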